author     Linus Torvalds <torvalds@ppc970.osdl.org>   2005-04-16 18:20:36 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>   2005-04-16 18:20:36 -0400
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/core
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile              17
-rw-r--r--  net/core/datagram.c           482
-rw-r--r--  net/core/dev.c               3359
-rw-r--r--  net/core/dev_mcast.c          299
-rw-r--r--  net/core/dst.c                276
-rw-r--r--  net/core/dv.c                 548
-rw-r--r--  net/core/ethtool.c            819
-rw-r--r--  net/core/filter.c             432
-rw-r--r--  net/core/flow.c               371
-rw-r--r--  net/core/gen_estimator.c      250
-rw-r--r--  net/core/gen_stats.c          239
-rw-r--r--  net/core/iovec.c              239
-rw-r--r--  net/core/link_watch.c         137
-rw-r--r--  net/core/neighbour.c         2362
-rw-r--r--  net/core/net-sysfs.c          461
-rw-r--r--  net/core/netfilter.c          799
-rw-r--r--  net/core/netpoll.c            735
-rw-r--r--  net/core/pktgen.c            3132
-rw-r--r--  net/core/rtnetlink.c          711
-rw-r--r--  net/core/scm.c                291
-rw-r--r--  net/core/skbuff.c            1460
-rw-r--r--  net/core/sock.c              1565
-rw-r--r--  net/core/stream.c             287
-rw-r--r--  net/core/sysctl_net_core.c    182
-rw-r--r--  net/core/utils.c              155
-rw-r--r--  net/core/wireless.c          1459
26 files changed, 21067 insertions, 0 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
new file mode 100644
index 000000000000..81f03243fe2f
--- /dev/null
+++ b/net/core/Makefile
@@ -0,0 +1,17 @@
1#
2# Makefile for the Linux networking core.
3#
4
5obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o gen_stats.o gen_estimator.o
6
7obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
8
9obj-y += flow.o dev.o ethtool.o dev_mcast.o dst.o \
10 neighbour.o rtnetlink.o utils.o link_watch.o filter.o
11
12obj-$(CONFIG_SYSFS) += net-sysfs.o
13obj-$(CONFIG_NETFILTER) += netfilter.o
14obj-$(CONFIG_NET_DIVERT) += dv.o
15obj-$(CONFIG_NET_PKTGEN) += pktgen.o
16obj-$(CONFIG_NET_RADIO) += wireless.o
17obj-$(CONFIG_NETPOLL) += netpoll.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
new file mode 100644
index 000000000000..d1bfd279cc1a
--- /dev/null
+++ b/net/core/datagram.c
@@ -0,0 +1,482 @@
1/*
2 * SUCS NET3:
3 *
4 * Generic datagram handling routines. These are generic for all
5 * protocols. Possibly a generic IP version on top of these would
6 * make sense. Not tonight however 8-).
7 * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
8 * NetROM layer all have identical poll code and mostly
9 * identical recvmsg() code. So we share it here. The poll was
10 * shared before but buried in udp.c so I moved it.
11 *
12 * Authors: Alan Cox <alan@redhat.com>. (datagram_poll() from old
13 * udp.c code)
14 *
15 * Fixes:
16 * Alan Cox : NULL return from skb_peek_copy()
17 * understood
18 * Alan Cox : Rewrote skb_read_datagram to avoid the
19 * skb_peek_copy stuff.
20 * Alan Cox : Added support for SOCK_SEQPACKET.
21 * IPX can no longer use the SO_TYPE hack
22 * but AX.25 now works right, and SPX is
23 * feasible.
24 * Alan Cox : Fixed write poll of non IP protocol
25 * crash.
26 * Florian La Roche: Changed for my new skbuff handling.
27 * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
28 * Linus Torvalds : BSD semantic fixes.
29 * Alan Cox : Datagram iovec handling
30 * Darryl Miles : Fixed non-blocking SOCK_STREAM.
31 * Alan Cox : POSIXisms
32 * Pete Wyckoff : Unconnected accept() fix.
33 *
34 */
35
36#include <linux/module.h>
37#include <linux/types.h>
38#include <linux/kernel.h>
39#include <asm/uaccess.h>
40#include <asm/system.h>
41#include <linux/mm.h>
42#include <linux/interrupt.h>
43#include <linux/errno.h>
44#include <linux/sched.h>
45#include <linux/inet.h>
46#include <linux/tcp.h>
47#include <linux/netdevice.h>
48#include <linux/rtnetlink.h>
49#include <linux/poll.h>
50#include <linux/highmem.h>
51
52#include <net/protocol.h>
53#include <linux/skbuff.h>
54#include <net/sock.h>
55#include <net/checksum.h>
56
57
58/*
59 * Is a socket 'connection oriented' ?
60 */
61static inline int connection_based(struct sock *sk)
62{
63 return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
64}
65
66/*
67 * Wait for a packet..
68 */
69static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
70{
71 int error;
72 DEFINE_WAIT(wait);
73
74 prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
75
76 /* Socket errors? */
77 error = sock_error(sk);
78 if (error)
79 goto out_err;
80
81 if (!skb_queue_empty(&sk->sk_receive_queue))
82 goto out;
83
84 /* Socket shut down? */
85 if (sk->sk_shutdown & RCV_SHUTDOWN)
86 goto out_noerr;
87
88 /* Sequenced packets can come disconnected.
89 * If so we report the problem
90 */
91 error = -ENOTCONN;
92 if (connection_based(sk) &&
93 !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
94 goto out_err;
95
96 /* handle signals */
97 if (signal_pending(current))
98 goto interrupted;
99
100 error = 0;
101 *timeo_p = schedule_timeout(*timeo_p);
102out:
103 finish_wait(sk->sk_sleep, &wait);
104 return error;
105interrupted:
106 error = sock_intr_errno(*timeo_p);
107out_err:
108 *err = error;
109 goto out;
110out_noerr:
111 *err = 0;
112 error = 1;
113 goto out;
114}
115
116/**
117 * skb_recv_datagram - Receive a datagram skbuff
118 * @sk - socket
119 * @flags - MSG_ flags
120 * @noblock - blocking operation?
121 * @err - error code returned
122 *
123 * Get a datagram skbuff, understands the peeking, nonblocking wakeups
124 * and possible races. This replaces identical code in packet, raw and
125 * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
126 * the long standing peek and read race for datagram sockets. If you
127 * alter this routine remember it must be re-entrant.
128 *
129 * This function will lock the socket if a skb is returned, so the caller
130 * needs to unlock the socket in that case (usually by calling
131 * skb_free_datagram)
132 *
133 * * It does not lock socket since today. This function is
134 * * free of race conditions. This measure should/can improve
135 * * significantly datagram socket latencies at high loads,
136 * * when data copying to user space takes lots of time.
137 * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
138 * * 8) Great win.)
139 * * --ANK (980729)
140 *
141 * The order of the tests when we find no data waiting are specified
142 * quite explicitly by POSIX 1003.1g, don't change them without having
143 * the standard around please.
144 */
145struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
146 int noblock, int *err)
147{
148 struct sk_buff *skb;
149 long timeo;
150 /*
151 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
152 */
153 int error = sock_error(sk);
154
155 if (error)
156 goto no_packet;
157
158 timeo = sock_rcvtimeo(sk, noblock);
159
160 do {
161 /* Again only user level code calls this function, so nothing
162 * interrupt level will suddenly eat the receive_queue.
163 *
164 * Look at current nfs client by the way...
 165	 * However, this function was correct in any case. 8)
166 */
167 if (flags & MSG_PEEK) {
168 unsigned long cpu_flags;
169
170 spin_lock_irqsave(&sk->sk_receive_queue.lock,
171 cpu_flags);
172 skb = skb_peek(&sk->sk_receive_queue);
173 if (skb)
174 atomic_inc(&skb->users);
175 spin_unlock_irqrestore(&sk->sk_receive_queue.lock,
176 cpu_flags);
177 } else
178 skb = skb_dequeue(&sk->sk_receive_queue);
179
180 if (skb)
181 return skb;
182
183 /* User doesn't want to wait */
184 error = -EAGAIN;
185 if (!timeo)
186 goto no_packet;
187
188 } while (!wait_for_packet(sk, err, &timeo));
189
190 return NULL;
191
192no_packet:
193 *err = error;
194 return NULL;
195}
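/*
 * Illustrative sketch (hypothetical, not part of this file): the calling
 * pattern the helpers above are meant for.  A protocol's recvmsg() path
 * fetches one datagram, copies as much as the caller asked for into the
 * iovec, and releases the skb.  example_recv() and the assumption that
 * the payload starts at skb->data are made up for the example.
 */
static int example_recv(struct sock *sk, struct iovec *iov, size_t len,
			int noblock, int flags)
{
	struct sk_buff *skb;
	int err, copied;

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (!skb)
		return err;

	copied = skb->len;
	if ((size_t)copied > len)
		copied = len;	/* silently truncate in this sketch */

	err = skb_copy_datagram_iovec(skb, 0, iov, copied);
	skb_free_datagram(sk, skb);
	return err ? err : copied;
}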
196
197void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
198{
199 kfree_skb(skb);
200}
201
202/**
203 * skb_copy_datagram_iovec - Copy a datagram to an iovec.
204 * @skb - buffer to copy
205 * @offset - offset in the buffer to start copying from
206 * @iovec - io vector to copy to
207 * @len - amount of data to copy from buffer to iovec
208 *
209 * Note: the iovec is modified during the copy.
210 */
211int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
212 struct iovec *to, int len)
213{
214 int start = skb_headlen(skb);
215 int i, copy = start - offset;
216
217 /* Copy header. */
218 if (copy > 0) {
219 if (copy > len)
220 copy = len;
221 if (memcpy_toiovec(to, skb->data + offset, copy))
222 goto fault;
223 if ((len -= copy) == 0)
224 return 0;
225 offset += copy;
226 }
227
228 /* Copy paged appendix. Hmm... why does this look so complicated? */
229 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
230 int end;
231
232 BUG_TRAP(start <= offset + len);
233
234 end = start + skb_shinfo(skb)->frags[i].size;
235 if ((copy = end - offset) > 0) {
236 int err;
237 u8 *vaddr;
238 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
239 struct page *page = frag->page;
240
241 if (copy > len)
242 copy = len;
243 vaddr = kmap(page);
244 err = memcpy_toiovec(to, vaddr + frag->page_offset +
245 offset - start, copy);
246 kunmap(page);
247 if (err)
248 goto fault;
249 if (!(len -= copy))
250 return 0;
251 offset += copy;
252 }
253 start = end;
254 }
255
256 if (skb_shinfo(skb)->frag_list) {
257 struct sk_buff *list = skb_shinfo(skb)->frag_list;
258
259 for (; list; list = list->next) {
260 int end;
261
262 BUG_TRAP(start <= offset + len);
263
264 end = start + list->len;
265 if ((copy = end - offset) > 0) {
266 if (copy > len)
267 copy = len;
268 if (skb_copy_datagram_iovec(list,
269 offset - start,
270 to, copy))
271 goto fault;
272 if ((len -= copy) == 0)
273 return 0;
274 offset += copy;
275 }
276 start = end;
277 }
278 }
279 if (!len)
280 return 0;
281
282fault:
283 return -EFAULT;
284}
285
286static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
287 u8 __user *to, int len,
288 unsigned int *csump)
289{
290 int start = skb_headlen(skb);
291 int pos = 0;
292 int i, copy = start - offset;
293
294 /* Copy header. */
295 if (copy > 0) {
296 int err = 0;
297 if (copy > len)
298 copy = len;
299 *csump = csum_and_copy_to_user(skb->data + offset, to, copy,
300 *csump, &err);
301 if (err)
302 goto fault;
303 if ((len -= copy) == 0)
304 return 0;
305 offset += copy;
306 to += copy;
307 pos = copy;
308 }
309
310 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
311 int end;
312
313 BUG_TRAP(start <= offset + len);
314
315 end = start + skb_shinfo(skb)->frags[i].size;
316 if ((copy = end - offset) > 0) {
317 unsigned int csum2;
318 int err = 0;
319 u8 *vaddr;
320 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
321 struct page *page = frag->page;
322
323 if (copy > len)
324 copy = len;
325 vaddr = kmap(page);
326 csum2 = csum_and_copy_to_user(vaddr +
327 frag->page_offset +
328 offset - start,
329 to, copy, 0, &err);
330 kunmap(page);
331 if (err)
332 goto fault;
333 *csump = csum_block_add(*csump, csum2, pos);
334 if (!(len -= copy))
335 return 0;
336 offset += copy;
337 to += copy;
338 pos += copy;
339 }
340 start = end;
341 }
342
343 if (skb_shinfo(skb)->frag_list) {
344 struct sk_buff *list = skb_shinfo(skb)->frag_list;
345
346 for (; list; list=list->next) {
347 int end;
348
349 BUG_TRAP(start <= offset + len);
350
351 end = start + list->len;
352 if ((copy = end - offset) > 0) {
353 unsigned int csum2 = 0;
354 if (copy > len)
355 copy = len;
356 if (skb_copy_and_csum_datagram(list,
357 offset - start,
358 to, copy,
359 &csum2))
360 goto fault;
361 *csump = csum_block_add(*csump, csum2, pos);
362 if ((len -= copy) == 0)
363 return 0;
364 offset += copy;
365 to += copy;
366 pos += copy;
367 }
368 start = end;
369 }
370 }
371 if (!len)
372 return 0;
373
374fault:
375 return -EFAULT;
376}
377
378/**
 379 * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
380 * @skb - skbuff
381 * @hlen - hardware length
382 * @iovec - io vector
383 *
384 * Caller _must_ check that skb will fit to this iovec.
385 *
386 * Returns: 0 - success.
387 * -EINVAL - checksum failure.
388 * -EFAULT - fault during copy. Beware, in this case iovec
389 * can be modified!
390 */
391int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
392 int hlen, struct iovec *iov)
393{
394 unsigned int csum;
395 int chunk = skb->len - hlen;
396
397 /* Skip filled elements.
398 * Pretty silly, look at memcpy_toiovec, though 8)
399 */
400 while (!iov->iov_len)
401 iov++;
402
403 if (iov->iov_len < chunk) {
404 if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen,
405 skb->csum)))
406 goto csum_error;
407 if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
408 goto fault;
409 } else {
410 csum = csum_partial(skb->data, hlen, skb->csum);
411 if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
412 chunk, &csum))
413 goto fault;
414 if ((unsigned short)csum_fold(csum))
415 goto csum_error;
416 iov->iov_len -= chunk;
417 iov->iov_base += chunk;
418 }
419 return 0;
420csum_error:
421 return -EINVAL;
422fault:
423 return -EFAULT;
424}
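/*
 * Illustrative sketch (hypothetical, not part of this file): how a
 * UDP-like receive path might choose between the two copy helpers.
 * When the hardware already verified the checksum a plain copy is
 * enough; otherwise the combined copy-and-verify above is used and
 * returns -EINVAL on a bad checksum.  "hdrlen" (the transport header
 * length) and the function name are assumptions for the example.
 */
static int example_copy_to_iovec(struct sk_buff *skb, struct iovec *iov,
				 int hdrlen, int copied)
{
	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
		return skb_copy_datagram_iovec(skb, hdrlen, iov, copied);

	/* caller must have checked that the iovec can hold the skb */
	return skb_copy_and_csum_datagram_iovec(skb, hdrlen, iov);
}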
425
426/**
427 * datagram_poll - generic datagram poll
428 * @file - file struct
429 * @sock - socket
430 * @wait - poll table
431 *
432 * Datagram poll: Again totally generic. This also handles
433 * sequenced packet sockets providing the socket receive queue
434 * is only ever holding data ready to receive.
435 *
436 * Note: when you _don't_ use this routine for this protocol,
437 * and you use a different write policy from sock_writeable()
438 * then please supply your own write_space callback.
439 */
440unsigned int datagram_poll(struct file *file, struct socket *sock,
441 poll_table *wait)
442{
443 struct sock *sk = sock->sk;
444 unsigned int mask;
445
446 poll_wait(file, sk->sk_sleep, wait);
447 mask = 0;
448
449 /* exceptional events? */
450 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
451 mask |= POLLERR;
452 if (sk->sk_shutdown == SHUTDOWN_MASK)
453 mask |= POLLHUP;
454
455 /* readable? */
456 if (!skb_queue_empty(&sk->sk_receive_queue) ||
457 (sk->sk_shutdown & RCV_SHUTDOWN))
458 mask |= POLLIN | POLLRDNORM;
459
460 /* Connection-based need to check for termination and startup */
461 if (connection_based(sk)) {
462 if (sk->sk_state == TCP_CLOSE)
463 mask |= POLLHUP;
464 /* connection hasn't started yet? */
465 if (sk->sk_state == TCP_SYN_SENT)
466 return mask;
467 }
468
469 /* writable? */
470 if (sock_writeable(sk))
471 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
472 else
473 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
474
475 return mask;
476}
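/*
 * Illustrative sketch (hypothetical, not part of this file): most
 * datagram protocols either point their proto_ops .poll field straight
 * at datagram_poll() or wrap it like this when they need
 * protocol-specific adjustments on top of the generic mask.
 */
static unsigned int example_poll(struct file *file, struct socket *sock,
				 poll_table *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);

	/* protocol-specific mask tweaks would go here */
	return mask;
}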
477
478EXPORT_SYMBOL(datagram_poll);
479EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
480EXPORT_SYMBOL(skb_copy_datagram_iovec);
481EXPORT_SYMBOL(skb_free_datagram);
482EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/dev.c b/net/core/dev.c
new file mode 100644
index 000000000000..42344d903692
--- /dev/null
+++ b/net/core/dev.c
@@ -0,0 +1,3359 @@
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <asm/system.h>
77#include <linux/bitops.h>
78#include <linux/config.h>
79#include <linux/cpu.h>
80#include <linux/types.h>
81#include <linux/kernel.h>
82#include <linux/sched.h>
83#include <linux/string.h>
84#include <linux/mm.h>
85#include <linux/socket.h>
86#include <linux/sockios.h>
87#include <linux/errno.h>
88#include <linux/interrupt.h>
89#include <linux/if_ether.h>
90#include <linux/netdevice.h>
91#include <linux/etherdevice.h>
92#include <linux/notifier.h>
93#include <linux/skbuff.h>
94#include <net/sock.h>
95#include <linux/rtnetlink.h>
96#include <linux/proc_fs.h>
97#include <linux/seq_file.h>
98#include <linux/stat.h>
99#include <linux/if_bridge.h>
100#include <linux/divert.h>
101#include <net/dst.h>
102#include <net/pkt_sched.h>
103#include <net/checksum.h>
104#include <linux/highmem.h>
105#include <linux/init.h>
106#include <linux/kmod.h>
107#include <linux/module.h>
108#include <linux/kallsyms.h>
109#include <linux/netpoll.h>
110#include <linux/rcupdate.h>
111#include <linux/delay.h>
112#ifdef CONFIG_NET_RADIO
113#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
114#include <net/iw_handler.h>
115#endif /* CONFIG_NET_RADIO */
116#include <asm/current.h>
117
118/* This define, if set, will randomly drop a packet when congestion
119 * is more than moderate. It helps fairness in the multi-interface
120 * case when one of them is a hog, but it kills performance for the
121 * single interface case so it is off now by default.
122 */
123#undef RAND_LIE
124
125/* Setting this will sample the queue lengths and thus congestion
126 * via a timer instead of as each packet is received.
127 */
128#undef OFFLINE_SAMPLE
129
130/*
131 * The list of packet types we will receive (as opposed to discard)
132 * and the routines to invoke.
133 *
134 * Why 16. Because with 16 the only overlap we get on a hash of the
135 * low nibble of the protocol value is RARP/SNAP/X.25.
136 *
137 * NOTE: That is no longer true with the addition of VLAN tags. Not
138 * sure which should go first, but I bet it won't make much
139 * difference if we are running VLANs. The good news is that
140 * this protocol won't be in the list unless compiled in, so
 141 * the average user (w/out VLANs) will not be adversely affected.
142 * --BLG
143 *
144 * 0800 IP
145 * 8100 802.1Q VLAN
146 * 0001 802.3
147 * 0002 AX.25
148 * 0004 802.2
149 * 8035 RARP
150 * 0005 SNAP
151 * 0805 X.25
152 * 0806 ARP
153 * 8137 IPX
154 * 0009 Localtalk
155 * 86DD IPv6
156 */
157
158static DEFINE_SPINLOCK(ptype_lock);
159static struct list_head ptype_base[16]; /* 16 way hashed list */
160static struct list_head ptype_all; /* Taps */
161
162#ifdef OFFLINE_SAMPLE
163static void sample_queue(unsigned long dummy);
164static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
165#endif
166
167/*
 168 * The @dev_base list is protected by @dev_base_lock and the rtnl
169 * semaphore.
170 *
171 * Pure readers hold dev_base_lock for reading.
172 *
173 * Writers must hold the rtnl semaphore while they loop through the
174 * dev_base list, and hold dev_base_lock for writing when they do the
175 * actual updates. This allows pure readers to access the list even
176 * while a writer is preparing to update it.
177 *
178 * To put it another way, dev_base_lock is held for writing only to
179 * protect against pure readers; the rtnl semaphore provides the
180 * protection against other writers.
181 *
182 * See, for example usages, register_netdevice() and
183 * unregister_netdevice(), which must be called with the rtnl
184 * semaphore held.
185 */
186struct net_device *dev_base;
187static struct net_device **dev_tail = &dev_base;
188DEFINE_RWLOCK(dev_base_lock);
189
190EXPORT_SYMBOL(dev_base);
191EXPORT_SYMBOL(dev_base_lock);
192
193#define NETDEV_HASHBITS 8
194static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
195static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
196
197static inline struct hlist_head *dev_name_hash(const char *name)
198{
199 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
200 return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
201}
202
203static inline struct hlist_head *dev_index_hash(int ifindex)
204{
205 return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
206}
207
208/*
209 * Our notifier list
210 */
211
212static struct notifier_block *netdev_chain;
213
214/*
215 * Device drivers call our routines to queue packets here. We empty the
216 * queue in the local softnet handler.
217 */
218DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
219
220#ifdef CONFIG_SYSFS
221extern int netdev_sysfs_init(void);
222extern int netdev_register_sysfs(struct net_device *);
223extern void netdev_unregister_sysfs(struct net_device *);
224#else
225#define netdev_sysfs_init() (0)
226#define netdev_register_sysfs(dev) (0)
227#define netdev_unregister_sysfs(dev) do { } while(0)
228#endif
229
230
231/*******************************************************************************
232
233 Protocol management and registration routines
234
235*******************************************************************************/
236
237/*
238 * For efficiency
239 */
240
241int netdev_nit;
242
243/*
244 * Add a protocol ID to the list. Now that the input handler is
245 * smarter we can dispense with all the messy stuff that used to be
246 * here.
247 *
248 * BEWARE!!! Protocol handlers, mangling input packets,
249 * MUST BE last in hash buckets and checking protocol handlers
250 * MUST start from promiscuous ptype_all chain in net_bh.
251 * It is true now, do not change it.
252 * Explanation follows: if protocol handler, mangling packet, will
253 * be the first on list, it is not able to sense, that packet
254 * is cloned and should be copied-on-write, so that it will
255 * change it and subsequent readers will get broken packet.
256 * --ANK (980803)
257 */
258
259/**
260 * dev_add_pack - add packet handler
261 * @pt: packet type declaration
262 *
263 * Add a protocol handler to the networking stack. The passed &packet_type
264 * is linked into kernel lists and may not be freed until it has been
265 * removed from the kernel lists.
266 *
267 * This call does not sleep therefore it can not
268 * guarantee all CPU's that are in middle of receiving packets
269 * will see the new packet type (until the next received packet).
270 */
271
272void dev_add_pack(struct packet_type *pt)
273{
274 int hash;
275
276 spin_lock_bh(&ptype_lock);
277 if (pt->type == htons(ETH_P_ALL)) {
278 netdev_nit++;
279 list_add_rcu(&pt->list, &ptype_all);
280 } else {
281 hash = ntohs(pt->type) & 15;
282 list_add_rcu(&pt->list, &ptype_base[hash]);
283 }
284 spin_unlock_bh(&ptype_lock);
285}
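/*
 * Illustrative sketch (hypothetical, not part of this file): registering
 * a receive handler for ethertype 0x88b5 (one of the IEEE "local
 * experimental" values).  example_rcv() and the packet_type instance are
 * made up for the example; a real user would pass the structure to
 * dev_add_pack() from its module init code.
 */
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt)
{
	/* consume the packet; a real handler would parse it first */
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_packet_type = {
	.type	= __constant_htons(0x88b5),
	.func	= example_rcv,
	/* .dev left NULL: accept this ethertype from any device */
};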
286
287extern void linkwatch_run_queue(void);
288
289
290
291/**
292 * __dev_remove_pack - remove packet handler
293 * @pt: packet type declaration
294 *
295 * Remove a protocol handler that was previously added to the kernel
296 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
297 * from the kernel lists and can be freed or reused once this function
298 * returns.
299 *
300 * The packet type might still be in use by receivers
301 * and must not be freed until after all the CPU's have gone
302 * through a quiescent state.
303 */
304void __dev_remove_pack(struct packet_type *pt)
305{
306 struct list_head *head;
307 struct packet_type *pt1;
308
309 spin_lock_bh(&ptype_lock);
310
311 if (pt->type == htons(ETH_P_ALL)) {
312 netdev_nit--;
313 head = &ptype_all;
314 } else
315 head = &ptype_base[ntohs(pt->type) & 15];
316
317 list_for_each_entry(pt1, head, list) {
318 if (pt == pt1) {
319 list_del_rcu(&pt->list);
320 goto out;
321 }
322 }
323
324 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
325out:
326 spin_unlock_bh(&ptype_lock);
327}
328/**
329 * dev_remove_pack - remove packet handler
330 * @pt: packet type declaration
331 *
332 * Remove a protocol handler that was previously added to the kernel
333 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
334 * from the kernel lists and can be freed or reused once this function
335 * returns.
336 *
337 * This call sleeps to guarantee that no CPU is looking at the packet
338 * type after return.
339 */
340void dev_remove_pack(struct packet_type *pt)
341{
342 __dev_remove_pack(pt);
343
344 synchronize_net();
345}
346
347/******************************************************************************
348
349 Device Boot-time Settings Routines
350
351*******************************************************************************/
352
353/* Boot time configuration table */
354static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
355
356/**
357 * netdev_boot_setup_add - add new setup entry
358 * @name: name of the device
359 * @map: configured settings for the device
360 *
361 * Adds new setup entry to the dev_boot_setup list. The function
362 * returns 0 on error and 1 on success. This is a generic routine to
363 * all netdevices.
364 */
365static int netdev_boot_setup_add(char *name, struct ifmap *map)
366{
367 struct netdev_boot_setup *s;
368 int i;
369
370 s = dev_boot_setup;
371 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
372 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
373 memset(s[i].name, 0, sizeof(s[i].name));
374 strcpy(s[i].name, name);
375 memcpy(&s[i].map, map, sizeof(s[i].map));
376 break;
377 }
378 }
379
380 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
381}
382
383/**
384 * netdev_boot_setup_check - check boot time settings
385 * @dev: the netdevice
386 *
387 * Check boot time settings for the device.
388 * The found settings are set for the device to be used
389 * later in the device probing.
390 * Returns 0 if no settings found, 1 if they are.
391 */
392int netdev_boot_setup_check(struct net_device *dev)
393{
394 struct netdev_boot_setup *s = dev_boot_setup;
395 int i;
396
397 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
398 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
399 !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
400 dev->irq = s[i].map.irq;
401 dev->base_addr = s[i].map.base_addr;
402 dev->mem_start = s[i].map.mem_start;
403 dev->mem_end = s[i].map.mem_end;
404 return 1;
405 }
406 }
407 return 0;
408}
409
410
411/**
412 * netdev_boot_base - get address from boot time settings
413 * @prefix: prefix for network device
414 * @unit: id for network device
415 *
416 * Check boot time settings for the base address of device.
417 * The found settings are set for the device to be used
418 * later in the device probing.
419 * Returns 0 if no settings found.
420 */
421unsigned long netdev_boot_base(const char *prefix, int unit)
422{
423 const struct netdev_boot_setup *s = dev_boot_setup;
424 char name[IFNAMSIZ];
425 int i;
426
427 sprintf(name, "%s%d", prefix, unit);
428
429 /*
430 * If device already registered then return base of 1
431 * to indicate not to probe for this interface
432 */
433 if (__dev_get_by_name(name))
434 return 1;
435
436 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
437 if (!strcmp(name, s[i].name))
438 return s[i].map.base_addr;
439 return 0;
440}
441
442/*
443 * Saves at boot time configured settings for any netdevice.
444 */
445int __init netdev_boot_setup(char *str)
446{
447 int ints[5];
448 struct ifmap map;
449
450 str = get_options(str, ARRAY_SIZE(ints), ints);
451 if (!str || !*str)
452 return 0;
453
454 /* Save settings */
455 memset(&map, 0, sizeof(map));
456 if (ints[0] > 0)
457 map.irq = ints[1];
458 if (ints[0] > 1)
459 map.base_addr = ints[2];
460 if (ints[0] > 2)
461 map.mem_start = ints[3];
462 if (ints[0] > 3)
463 map.mem_end = ints[4];
464
465 /* Add new entry to the list */
466 return netdev_boot_setup_add(str, &map);
467}
468
469__setup("netdev=", netdev_boot_setup);
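/*
 * Illustrative note (hypothetical values, not part of this file): with
 * the __setup() hook above, a kernel command line such as
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * is parsed by get_options() into ints[] = { 4, 9, 0x300, 0, 0 } with
 * "eth0" left in str, so the entry stored by netdev_boot_setup_add()
 * carries irq 9 and I/O base 0x300 for the device that will later
 * register as "eth0".
 */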
470
471/*******************************************************************************
472
473 Device Interface Subroutines
474
475*******************************************************************************/
476
477/**
478 * __dev_get_by_name - find a device by its name
479 * @name: name to find
480 *
481 * Find an interface by name. Must be called under RTNL semaphore
482 * or @dev_base_lock. If the name is found a pointer to the device
483 * is returned. If the name is not found then %NULL is returned. The
484 * reference counters are not incremented so the caller must be
485 * careful with locks.
486 */
487
488struct net_device *__dev_get_by_name(const char *name)
489{
490 struct hlist_node *p;
491
492 hlist_for_each(p, dev_name_hash(name)) {
493 struct net_device *dev
494 = hlist_entry(p, struct net_device, name_hlist);
495 if (!strncmp(dev->name, name, IFNAMSIZ))
496 return dev;
497 }
498 return NULL;
499}
500
501/**
502 * dev_get_by_name - find a device by its name
503 * @name: name to find
504 *
505 * Find an interface by name. This can be called from any
506 * context and does its own locking. The returned handle has
507 * the usage count incremented and the caller must use dev_put() to
508 * release it when it is no longer needed. %NULL is returned if no
509 * matching device is found.
510 */
511
512struct net_device *dev_get_by_name(const char *name)
513{
514 struct net_device *dev;
515
516 read_lock(&dev_base_lock);
517 dev = __dev_get_by_name(name);
518 if (dev)
519 dev_hold(dev);
520 read_unlock(&dev_base_lock);
521 return dev;
522}
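/*
 * Illustrative sketch (hypothetical, not part of this file): the two
 * lookup flavours differ only in reference handling.  dev_get_by_name()
 * takes a reference the caller must drop with dev_put(), while
 * __dev_get_by_name() does not and is therefore only safe under the
 * RTNL semaphore or dev_base_lock.
 */
static int example_find_device(const char *name)
{
	struct net_device *dev = dev_get_by_name(name);

	if (!dev)
		return -ENODEV;
	/* ... use dev here ... */
	dev_put(dev);
	return 0;
}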
523
524/**
525 * __dev_get_by_index - find a device by its ifindex
526 * @ifindex: index of device
527 *
528 * Search for an interface by index. Returns %NULL if the device
529 * is not found or a pointer to the device. The device has not
530 * had its reference counter increased so the caller must be careful
531 * about locking. The caller must hold either the RTNL semaphore
532 * or @dev_base_lock.
533 */
534
535struct net_device *__dev_get_by_index(int ifindex)
536{
537 struct hlist_node *p;
538
539 hlist_for_each(p, dev_index_hash(ifindex)) {
540 struct net_device *dev
541 = hlist_entry(p, struct net_device, index_hlist);
542 if (dev->ifindex == ifindex)
543 return dev;
544 }
545 return NULL;
546}
547
548
549/**
550 * dev_get_by_index - find a device by its ifindex
551 * @ifindex: index of device
552 *
553 * Search for an interface by index. Returns NULL if the device
554 * is not found or a pointer to the device. The device returned has
555 * had a reference added and the pointer is safe until the user calls
556 * dev_put to indicate they have finished with it.
557 */
558
559struct net_device *dev_get_by_index(int ifindex)
560{
561 struct net_device *dev;
562
563 read_lock(&dev_base_lock);
564 dev = __dev_get_by_index(ifindex);
565 if (dev)
566 dev_hold(dev);
567 read_unlock(&dev_base_lock);
568 return dev;
569}
570
571/**
572 * dev_getbyhwaddr - find a device by its hardware address
573 * @type: media type of device
574 * @ha: hardware address
575 *
576 * Search for an interface by MAC address. Returns NULL if the device
577 * is not found or a pointer to the device. The caller must hold the
578 * rtnl semaphore. The returned device has not had its ref count increased
579 * and the caller must therefore be careful about locking
580 *
581 * BUGS:
582 * If the API was consistent this would be __dev_get_by_hwaddr
583 */
584
585struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
586{
587 struct net_device *dev;
588
589 ASSERT_RTNL();
590
591 for (dev = dev_base; dev; dev = dev->next)
592 if (dev->type == type &&
593 !memcmp(dev->dev_addr, ha, dev->addr_len))
594 break;
595 return dev;
596}
597
598struct net_device *dev_getfirstbyhwtype(unsigned short type)
599{
600 struct net_device *dev;
601
602 rtnl_lock();
603 for (dev = dev_base; dev; dev = dev->next) {
604 if (dev->type == type) {
605 dev_hold(dev);
606 break;
607 }
608 }
609 rtnl_unlock();
610 return dev;
611}
612
613EXPORT_SYMBOL(dev_getfirstbyhwtype);
614
615/**
616 * dev_get_by_flags - find any device with given flags
617 * @if_flags: IFF_* values
618 * @mask: bitmask of bits in if_flags to check
619 *
620 * Search for any interface with the given flags. Returns NULL if a device
621 * is not found or a pointer to the device. The device returned has
622 * had a reference added and the pointer is safe until the user calls
623 * dev_put to indicate they have finished with it.
624 */
625
626struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
627{
628 struct net_device *dev;
629
630 read_lock(&dev_base_lock);
631 for (dev = dev_base; dev != NULL; dev = dev->next) {
632 if (((dev->flags ^ if_flags) & mask) == 0) {
633 dev_hold(dev);
634 break;
635 }
636 }
637 read_unlock(&dev_base_lock);
638 return dev;
639}
640
641/**
642 * dev_valid_name - check if name is okay for network device
643 * @name: name string
644 *
645 * Network device names need to be valid file names to
646 * to allow sysfs to work
647 */
648static int dev_valid_name(const char *name)
649{
650 return !(*name == '\0'
651 || !strcmp(name, ".")
652 || !strcmp(name, "..")
653 || strchr(name, '/'));
654}
655
656/**
657 * dev_alloc_name - allocate a name for a device
658 * @dev: device
659 * @name: name format string
660 *
661 * Passed a format string - eg "lt%d" it will try and find a suitable
662 * id. Not efficient for many devices, not called a lot. The caller
663 * must hold the dev_base or rtnl lock while allocating the name and
664 * adding the device in order to avoid duplicates. Returns the number
665 * of the unit assigned or a negative errno code.
666 */
667
668int dev_alloc_name(struct net_device *dev, const char *name)
669{
670 int i = 0;
671 char buf[IFNAMSIZ];
672 const char *p;
673 const int max_netdevices = 8*PAGE_SIZE;
674 long *inuse;
675 struct net_device *d;
676
677 p = strnchr(name, IFNAMSIZ-1, '%');
678 if (p) {
679 /*
680 * Verify the string as this thing may have come from
681 * the user. There must be either one "%d" and no other "%"
682 * characters.
683 */
684 if (p[1] != 'd' || strchr(p + 2, '%'))
685 return -EINVAL;
686
687 /* Use one page as a bit array of possible slots */
688 inuse = (long *) get_zeroed_page(GFP_ATOMIC);
689 if (!inuse)
690 return -ENOMEM;
691
692 for (d = dev_base; d; d = d->next) {
693 if (!sscanf(d->name, name, &i))
694 continue;
695 if (i < 0 || i >= max_netdevices)
696 continue;
697
698 /* avoid cases where sscanf is not exact inverse of printf */
699 snprintf(buf, sizeof(buf), name, i);
700 if (!strncmp(buf, d->name, IFNAMSIZ))
701 set_bit(i, inuse);
702 }
703
704 i = find_first_zero_bit(inuse, max_netdevices);
705 free_page((unsigned long) inuse);
706 }
707
708 snprintf(buf, sizeof(buf), name, i);
709 if (!__dev_get_by_name(buf)) {
710 strlcpy(dev->name, buf, IFNAMSIZ);
711 return i;
712 }
713
714 /* It is possible to run out of possible slots
715 * when the name is long and there isn't enough space left
716 * for the digits, or if all bits are used.
717 */
718 return -ENFILE;
719}
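/*
 * Illustrative sketch (hypothetical, not part of this file): with the
 * format string "eth%d", dev_alloc_name() picks the lowest unused unit,
 * writes e.g. "eth2" into dev->name and returns the unit number (or a
 * negative errno).  The caller is expected to hold the rtnl semaphore
 * so the chosen name cannot race with another registration.
 */
static int example_pick_name(struct net_device *dev)
{
	int unit = dev_alloc_name(dev, "eth%d");

	return unit < 0 ? unit : 0;	/* dev->name now holds "ethN" */
}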
720
721
722/**
723 * dev_change_name - change name of a device
724 * @dev: device
725 * @newname: name (or format string) must be at least IFNAMSIZ
726 *
727 * Change name of a device, can pass format strings "eth%d".
728 * for wildcarding.
729 */
730int dev_change_name(struct net_device *dev, char *newname)
731{
732 int err = 0;
733
734 ASSERT_RTNL();
735
736 if (dev->flags & IFF_UP)
737 return -EBUSY;
738
739 if (!dev_valid_name(newname))
740 return -EINVAL;
741
742 if (strchr(newname, '%')) {
743 err = dev_alloc_name(dev, newname);
744 if (err < 0)
745 return err;
746 strcpy(newname, dev->name);
747 }
748 else if (__dev_get_by_name(newname))
749 return -EEXIST;
750 else
751 strlcpy(dev->name, newname, IFNAMSIZ);
752
753 err = class_device_rename(&dev->class_dev, dev->name);
754 if (!err) {
755 hlist_del(&dev->name_hlist);
756 hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
757 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
758 }
759
760 return err;
761}
762
763/**
764 * netdev_state_change - device changes state
765 * @dev: device to cause notification
766 *
767 * Called to indicate a device has changed state. This function calls
768 * the notifier chains for netdev_chain and sends a NEWLINK message
769 * to the routing socket.
770 */
771void netdev_state_change(struct net_device *dev)
772{
773 if (dev->flags & IFF_UP) {
774 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
775 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
776 }
777}
778
779/**
780 * dev_load - load a network module
781 * @name: name of interface
782 *
783 * If a network interface is not present and the process has suitable
784 * privileges this function loads the module. If module loading is not
785 * available in this kernel then it becomes a nop.
786 */
787
788void dev_load(const char *name)
789{
790 struct net_device *dev;
791
792 read_lock(&dev_base_lock);
793 dev = __dev_get_by_name(name);
794 read_unlock(&dev_base_lock);
795
796 if (!dev && capable(CAP_SYS_MODULE))
797 request_module("%s", name);
798}
799
800static int default_rebuild_header(struct sk_buff *skb)
801{
802 printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
803 skb->dev ? skb->dev->name : "NULL!!!");
804 kfree_skb(skb);
805 return 1;
806}
807
808
809/**
810 * dev_open - prepare an interface for use.
811 * @dev: device to open
812 *
813 * Takes a device from down to up state. The device's private open
814 * function is invoked and then the multicast lists are loaded. Finally
815 * the device is moved into the up state and a %NETDEV_UP message is
816 * sent to the netdev notifier chain.
817 *
818 * Calling this function on an active interface is a nop. On a failure
819 * a negative errno code is returned.
820 */
821int dev_open(struct net_device *dev)
822{
823 int ret = 0;
824
825 /*
826 * Is it already up?
827 */
828
829 if (dev->flags & IFF_UP)
830 return 0;
831
832 /*
833 * Is it even present?
834 */
835 if (!netif_device_present(dev))
836 return -ENODEV;
837
838 /*
839 * Call device private open method
840 */
841 set_bit(__LINK_STATE_START, &dev->state);
842 if (dev->open) {
843 ret = dev->open(dev);
844 if (ret)
845 clear_bit(__LINK_STATE_START, &dev->state);
846 }
847
848 /*
849 * If it went open OK then:
850 */
851
852 if (!ret) {
853 /*
854 * Set the flags.
855 */
856 dev->flags |= IFF_UP;
857
858 /*
859 * Initialize multicasting status
860 */
861 dev_mc_upload(dev);
862
863 /*
864 * Wakeup transmit queue engine
865 */
866 dev_activate(dev);
867
868 /*
869 * ... and announce new interface.
870 */
871 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
872 }
873 return ret;
874}
875
876/**
877 * dev_close - shutdown an interface.
878 * @dev: device to shutdown
879 *
880 * This function moves an active device into down state. A
881 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
882 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
883 * chain.
884 */
885int dev_close(struct net_device *dev)
886{
887 if (!(dev->flags & IFF_UP))
888 return 0;
889
890 /*
 891	 * Tell people we are going down, so that they can
 892	 * prepare for it while the device is still operating.
893 */
894 notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
895
896 dev_deactivate(dev);
897
898 clear_bit(__LINK_STATE_START, &dev->state);
899
900 /* Synchronize to scheduled poll. We cannot touch poll list,
901 * it can be even on different cpu. So just clear netif_running(),
902 * and wait when poll really will happen. Actually, the best place
903 * for this is inside dev->stop() after device stopped its irq
904 * engine, but this requires more changes in devices. */
905
906 smp_mb__after_clear_bit(); /* Commit netif_running(). */
907 while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
908 /* No hurry. */
909 current->state = TASK_INTERRUPTIBLE;
910 schedule_timeout(1);
911 }
912
913 /*
914 * Call the device specific close. This cannot fail.
915 * Only if device is UP
916 *
917 * We allow it to be called even after a DETACH hot-plug
918 * event.
919 */
920 if (dev->stop)
921 dev->stop(dev);
922
923 /*
924 * Device is now down.
925 */
926
927 dev->flags &= ~IFF_UP;
928
929 /*
930 * Tell people we are down
931 */
932 notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
933
934 return 0;
935}
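/*
 * Illustrative sketch (hypothetical, not part of this file): dev_open()
 * and dev_close() are normally invoked with the RTNL semaphore held,
 * as the ioctl and rtnetlink paths do.  A minimal administrative
 * toggle might look like this.
 */
static int example_set_link(struct net_device *dev, int up)
{
	int err;

	rtnl_lock();
	err = up ? dev_open(dev) : dev_close(dev);
	rtnl_unlock();
	return err;
}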
936
937
938/*
939 * Device change register/unregister. These are not inline or static
940 * as we export them to the world.
941 */
942
943/**
944 * register_netdevice_notifier - register a network notifier block
945 * @nb: notifier
946 *
947 * Register a notifier to be called when network device events occur.
948 * The notifier passed is linked into the kernel structures and must
949 * not be reused until it has been unregistered. A negative errno code
950 * is returned on a failure.
951 *
952 * When registered all registration and up events are replayed
953 * to the new notifier to allow device to have a race free
954 * view of the network device list.
955 */
956
957int register_netdevice_notifier(struct notifier_block *nb)
958{
959 struct net_device *dev;
960 int err;
961
962 rtnl_lock();
963 err = notifier_chain_register(&netdev_chain, nb);
964 if (!err) {
965 for (dev = dev_base; dev; dev = dev->next) {
966 nb->notifier_call(nb, NETDEV_REGISTER, dev);
967
968 if (dev->flags & IFF_UP)
969 nb->notifier_call(nb, NETDEV_UP, dev);
970 }
971 }
972 rtnl_unlock();
973 return err;
974}
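/*
 * Illustrative sketch (hypothetical, not part of this file): a minimal
 * notifier that logs interfaces coming up.  Because existing devices
 * are replayed at registration time, the callback also sees
 * NETDEV_REGISTER/NETDEV_UP for devices that were already present.
 */
static int example_netdev_event(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UP)
		printk(KERN_INFO "%s is up\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call	= example_netdev_event,
};
/* registered with register_netdevice_notifier(&example_netdev_notifier) */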
975
976/**
977 * unregister_netdevice_notifier - unregister a network notifier block
978 * @nb: notifier
979 *
980 * Unregister a notifier previously registered by
981 * register_netdevice_notifier(). The notifier is unlinked into the
982 * kernel structures and may then be reused. A negative errno code
983 * is returned on a failure.
984 */
985
986int unregister_netdevice_notifier(struct notifier_block *nb)
987{
988 return notifier_chain_unregister(&netdev_chain, nb);
989}
990
991/**
992 * call_netdevice_notifiers - call all network notifier blocks
993 * @val: value passed unmodified to notifier function
994 * @v: pointer passed unmodified to notifier function
995 *
996 * Call all network notifier blocks. Parameters and return value
997 * are as for notifier_call_chain().
998 */
999
1000int call_netdevice_notifiers(unsigned long val, void *v)
1001{
1002 return notifier_call_chain(&netdev_chain, val, v);
1003}
1004
1005/* When > 0 there are consumers of rx skb time stamps */
1006static atomic_t netstamp_needed = ATOMIC_INIT(0);
1007
1008void net_enable_timestamp(void)
1009{
1010 atomic_inc(&netstamp_needed);
1011}
1012
1013void net_disable_timestamp(void)
1014{
1015 atomic_dec(&netstamp_needed);
1016}
1017
1018static inline void net_timestamp(struct timeval *stamp)
1019{
1020 if (atomic_read(&netstamp_needed))
1021 do_gettimeofday(stamp);
1022 else {
1023 stamp->tv_sec = 0;
1024 stamp->tv_usec = 0;
1025 }
1026}
1027
1028/*
1029 * Support routine. Sends outgoing frames to any network
1030 * taps currently in use.
1031 */
1032
1033void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1034{
1035 struct packet_type *ptype;
1036 net_timestamp(&skb->stamp);
1037
1038 rcu_read_lock();
1039 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1040 /* Never send packets back to the socket
1041 * they originated from - MvS (miquels@drinkel.ow.org)
1042 */
1043 if ((ptype->dev == dev || !ptype->dev) &&
1044 (ptype->af_packet_priv == NULL ||
1045 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1046 struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1047 if (!skb2)
1048 break;
1049
1050 /* skb->nh should be correctly
1051 set by sender, so that the second statement is
1052 just protection against buggy protocols.
1053 */
1054 skb2->mac.raw = skb2->data;
1055
1056 if (skb2->nh.raw < skb2->data ||
1057 skb2->nh.raw > skb2->tail) {
1058 if (net_ratelimit())
1059 printk(KERN_CRIT "protocol %04x is "
1060 "buggy, dev %s\n",
1061 skb2->protocol, dev->name);
1062 skb2->nh.raw = skb2->data;
1063 }
1064
1065 skb2->h.raw = skb2->nh.raw;
1066 skb2->pkt_type = PACKET_OUTGOING;
1067 ptype->func(skb2, skb->dev, ptype);
1068 }
1069 }
1070 rcu_read_unlock();
1071}
1072
1073/*
1074 * Invalidate hardware checksum when packet is to be mangled, and
1075 * complete checksum manually on outgoing path.
1076 */
1077int skb_checksum_help(struct sk_buff *skb, int inward)
1078{
1079 unsigned int csum;
1080 int ret = 0, offset = skb->h.raw - skb->data;
1081
1082 if (inward) {
1083 skb->ip_summed = CHECKSUM_NONE;
1084 goto out;
1085 }
1086
1087 if (skb_cloned(skb)) {
1088 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1089 if (ret)
1090 goto out;
1091 }
1092
1093 if (offset > (int)skb->len)
1094 BUG();
1095 csum = skb_checksum(skb, offset, skb->len-offset, 0);
1096
1097 offset = skb->tail - skb->h.raw;
1098 if (offset <= 0)
1099 BUG();
1100 if (skb->csum + 2 > offset)
1101 BUG();
1102
1103 *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
1104 skb->ip_summed = CHECKSUM_NONE;
1105out:
1106 return ret;
1107}
1108
1109#ifdef CONFIG_HIGHMEM
1110/* Actually, we should eliminate this check as soon as we know, that:
1111 * 1. IOMMU is present and allows to map all the memory.
1112 * 2. No high memory really exists on this machine.
1113 */
1114
1115static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1116{
1117 int i;
1118
1119 if (dev->features & NETIF_F_HIGHDMA)
1120 return 0;
1121
1122 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1123 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1124 return 1;
1125
1126 return 0;
1127}
1128#else
1129#define illegal_highdma(dev, skb) (0)
1130#endif
1131
1132extern void skb_release_data(struct sk_buff *);
1133
1134/* Keep head the same: replace data */
1135int __skb_linearize(struct sk_buff *skb, int gfp_mask)
1136{
1137 unsigned int size;
1138 u8 *data;
1139 long offset;
1140 struct skb_shared_info *ninfo;
1141 int headerlen = skb->data - skb->head;
1142 int expand = (skb->tail + skb->data_len) - skb->end;
1143
1144 if (skb_shared(skb))
1145 BUG();
1146
1147 if (expand <= 0)
1148 expand = 0;
1149
1150 size = skb->end - skb->head + expand;
1151 size = SKB_DATA_ALIGN(size);
1152 data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
1153 if (!data)
1154 return -ENOMEM;
1155
1156 /* Copy entire thing */
1157 if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
1158 BUG();
1159
1160 /* Set up shinfo */
1161 ninfo = (struct skb_shared_info*)(data + size);
1162 atomic_set(&ninfo->dataref, 1);
1163 ninfo->tso_size = skb_shinfo(skb)->tso_size;
1164 ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
1165 ninfo->nr_frags = 0;
1166 ninfo->frag_list = NULL;
1167
1168 /* Offset between the two in bytes */
1169 offset = data - skb->head;
1170
1171 /* Free old data. */
1172 skb_release_data(skb);
1173
1174 skb->head = data;
1175 skb->end = data + size;
1176
1177 /* Set up new pointers */
1178 skb->h.raw += offset;
1179 skb->nh.raw += offset;
1180 skb->mac.raw += offset;
1181 skb->tail += offset;
1182 skb->data += offset;
1183
1184 /* We are no longer a clone, even if we were. */
1185 skb->cloned = 0;
1186
1187 skb->tail += skb->data_len;
1188 skb->data_len = 0;
1189 return 0;
1190}
1191
1192#define HARD_TX_LOCK(dev, cpu) { \
1193 if ((dev->features & NETIF_F_LLTX) == 0) { \
1194 spin_lock(&dev->xmit_lock); \
1195 dev->xmit_lock_owner = cpu; \
1196 } \
1197}
1198
1199#define HARD_TX_UNLOCK(dev) { \
1200 if ((dev->features & NETIF_F_LLTX) == 0) { \
1201 dev->xmit_lock_owner = -1; \
1202 spin_unlock(&dev->xmit_lock); \
1203 } \
1204}
1205
1206/**
1207 * dev_queue_xmit - transmit a buffer
1208 * @skb: buffer to transmit
1209 *
1210 * Queue a buffer for transmission to a network device. The caller must
1211 * have set the device and priority and built the buffer before calling
1212 * this function. The function can be called from an interrupt.
1213 *
1214 * A negative errno code is returned on a failure. A success does not
1215 * guarantee the frame will be transmitted as it may be dropped due
1216 * to congestion or traffic shaping.
1217 */
1218
1219int dev_queue_xmit(struct sk_buff *skb)
1220{
1221 struct net_device *dev = skb->dev;
1222 struct Qdisc *q;
1223 int rc = -ENOMEM;
1224
1225 if (skb_shinfo(skb)->frag_list &&
1226 !(dev->features & NETIF_F_FRAGLIST) &&
1227 __skb_linearize(skb, GFP_ATOMIC))
1228 goto out_kfree_skb;
1229
1230 /* Fragmented skb is linearized if device does not support SG,
1231 * or if at least one of fragments is in highmem and device
1232 * does not support DMA from it.
1233 */
1234 if (skb_shinfo(skb)->nr_frags &&
1235 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1236 __skb_linearize(skb, GFP_ATOMIC))
1237 goto out_kfree_skb;
1238
1239 /* If packet is not checksummed and device does not support
1240 * checksumming for this protocol, complete checksumming here.
1241 */
1242 if (skb->ip_summed == CHECKSUM_HW &&
1243 (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
1244 (!(dev->features & NETIF_F_IP_CSUM) ||
1245 skb->protocol != htons(ETH_P_IP))))
1246 if (skb_checksum_help(skb, 0))
1247 goto out_kfree_skb;
1248
1249 /* Disable soft irqs for various locks below. Also
1250 * stops preemption for RCU.
1251 */
1252 local_bh_disable();
1253
1254 /* Updates of qdisc are serialized by queue_lock.
1255 * The struct Qdisc which is pointed to by qdisc is now a
1256 * rcu structure - it may be accessed without acquiring
1257 * a lock (but the structure may be stale.) The freeing of the
1258 * qdisc will be deferred until it's known that there are no
1259 * more references to it.
1260 *
1261 * If the qdisc has an enqueue function, we still need to
1262 * hold the queue_lock before calling it, since queue_lock
1263 * also serializes access to the device queue.
1264 */
1265
1266 q = rcu_dereference(dev->qdisc);
1267#ifdef CONFIG_NET_CLS_ACT
1268 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1269#endif
1270 if (q->enqueue) {
1271 /* Grab device queue */
1272 spin_lock(&dev->queue_lock);
1273
1274 rc = q->enqueue(skb, q);
1275
1276 qdisc_run(dev);
1277
1278 spin_unlock(&dev->queue_lock);
1279 rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1280 goto out;
1281 }
1282
1283 /* The device has no queue. Common case for software devices:
1284 loopback, all the sorts of tunnels...
1285
1286 Really, it is unlikely that xmit_lock protection is necessary here.
1287 (f.e. loopback and IP tunnels are clean ignoring statistics
1288 counters.)
1289 However, it is possible, that they rely on protection
1290 made by us here.
1291
 1292	   Check this and take the lock. It is not prone to deadlocks.
 1293	   Or just use the noqueue qdisc, it is even simpler 8)
1294 */
1295 if (dev->flags & IFF_UP) {
1296 int cpu = smp_processor_id(); /* ok because BHs are off */
1297
1298 if (dev->xmit_lock_owner != cpu) {
1299
1300 HARD_TX_LOCK(dev, cpu);
1301
1302 if (!netif_queue_stopped(dev)) {
1303 if (netdev_nit)
1304 dev_queue_xmit_nit(skb, dev);
1305
1306 rc = 0;
1307 if (!dev->hard_start_xmit(skb, dev)) {
1308 HARD_TX_UNLOCK(dev);
1309 goto out;
1310 }
1311 }
1312 HARD_TX_UNLOCK(dev);
1313 if (net_ratelimit())
1314 printk(KERN_CRIT "Virtual device %s asks to "
1315 "queue packet!\n", dev->name);
1316 } else {
1317 /* Recursion is detected! It is possible,
1318 * unfortunately */
1319 if (net_ratelimit())
1320 printk(KERN_CRIT "Dead loop on virtual device "
1321 "%s, fix it urgently!\n", dev->name);
1322 }
1323 }
1324
1325 rc = -ENETDOWN;
1326 local_bh_enable();
1327
1328out_kfree_skb:
1329 kfree_skb(skb);
1330 return rc;
1331out:
1332 local_bh_enable();
1333 return rc;
1334}
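/*
 * Illustrative sketch (hypothetical, not part of this file): callers
 * are expected to have set skb->dev, skb->protocol and the link-layer
 * header before handing the buffer over; dev_queue_xmit() then either
 * enqueues it on the device qdisc or, for queueless virtual devices,
 * transmits it directly.  The skb is consumed in every case, so there
 * is no kfree_skb() on error here.
 */
static int example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);	/* example value */
	return dev_queue_xmit(skb);
}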
1335
1336
1337/*=======================================================================
1338 Receiver routines
1339 =======================================================================*/
1340
1341int netdev_max_backlog = 300;
1342int weight_p = 64; /* old backlog weight */
1343/* These numbers are selected based on intuition and some
 1344 * experimentation; if you have a more scientific way of doing this
1345 * please go ahead and fix things.
1346 */
1347int no_cong_thresh = 10;
1348int no_cong = 20;
1349int lo_cong = 100;
1350int mod_cong = 290;
1351
1352DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1353
1354
1355static void get_sample_stats(int cpu)
1356{
1357#ifdef RAND_LIE
1358 unsigned long rd;
1359 int rq;
1360#endif
1361 struct softnet_data *sd = &per_cpu(softnet_data, cpu);
1362 int blog = sd->input_pkt_queue.qlen;
1363 int avg_blog = sd->avg_blog;
1364
1365 avg_blog = (avg_blog >> 1) + (blog >> 1);
1366
1367 if (avg_blog > mod_cong) {
1368 /* Above moderate congestion levels. */
1369 sd->cng_level = NET_RX_CN_HIGH;
1370#ifdef RAND_LIE
1371 rd = net_random();
1372 rq = rd % netdev_max_backlog;
1373 if (rq < avg_blog) /* unlucky bastard */
1374 sd->cng_level = NET_RX_DROP;
1375#endif
1376 } else if (avg_blog > lo_cong) {
1377 sd->cng_level = NET_RX_CN_MOD;
1378#ifdef RAND_LIE
1379 rd = net_random();
1380 rq = rd % netdev_max_backlog;
1381 if (rq < avg_blog) /* unlucky bastard */
1382 sd->cng_level = NET_RX_CN_HIGH;
1383#endif
1384 } else if (avg_blog > no_cong)
1385 sd->cng_level = NET_RX_CN_LOW;
1386 else /* no congestion */
1387 sd->cng_level = NET_RX_SUCCESS;
1388
1389 sd->avg_blog = avg_blog;
1390}
1391
1392#ifdef OFFLINE_SAMPLE
1393static void sample_queue(unsigned long dummy)
1394{
 1395/* 10 ms or 1 ms -- I don't care -- JHS */
1396 int next_tick = 1;
1397 int cpu = smp_processor_id();
1398
1399 get_sample_stats(cpu);
1400 next_tick += jiffies;
1401 mod_timer(&samp_timer, next_tick);
1402}
1403#endif
1404
1405
1406/**
1407 * netif_rx - post buffer to the network code
1408 * @skb: buffer to post
1409 *
1410 * This function receives a packet from a device driver and queues it for
1411 * the upper (protocol) levels to process. It always succeeds. The buffer
1412 * may be dropped during processing for congestion control or by the
1413 * protocol layers.
1414 *
1415 * return values:
1416 * NET_RX_SUCCESS (no congestion)
1417 * NET_RX_CN_LOW (low congestion)
1418 * NET_RX_CN_MOD (moderate congestion)
1419 * NET_RX_CN_HIGH (high congestion)
1420 * NET_RX_DROP (packet was dropped)
1421 *
1422 */
1423
1424int netif_rx(struct sk_buff *skb)
1425{
1426 int this_cpu;
1427 struct softnet_data *queue;
1428 unsigned long flags;
1429
1430 /* if netpoll wants it, pretend we never saw it */
1431 if (netpoll_rx(skb))
1432 return NET_RX_DROP;
1433
1434 if (!skb->stamp.tv_sec)
1435 net_timestamp(&skb->stamp);
1436
1437 /*
 1438	 * The code is rearranged so that the path is shortest
 1439	 * when the CPU is congested but still operating.
1440 */
1441 local_irq_save(flags);
1442 this_cpu = smp_processor_id();
1443 queue = &__get_cpu_var(softnet_data);
1444
1445 __get_cpu_var(netdev_rx_stat).total++;
1446 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1447 if (queue->input_pkt_queue.qlen) {
1448 if (queue->throttle)
1449 goto drop;
1450
1451enqueue:
1452 dev_hold(skb->dev);
1453 __skb_queue_tail(&queue->input_pkt_queue, skb);
1454#ifndef OFFLINE_SAMPLE
1455 get_sample_stats(this_cpu);
1456#endif
1457 local_irq_restore(flags);
1458 return queue->cng_level;
1459 }
1460
1461 if (queue->throttle)
1462 queue->throttle = 0;
1463
1464 netif_rx_schedule(&queue->backlog_dev);
1465 goto enqueue;
1466 }
1467
1468 if (!queue->throttle) {
1469 queue->throttle = 1;
1470 __get_cpu_var(netdev_rx_stat).throttled++;
1471 }
1472
1473drop:
1474 __get_cpu_var(netdev_rx_stat).dropped++;
1475 local_irq_restore(flags);
1476
1477 kfree_skb(skb);
1478 return NET_RX_DROP;
1479}
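/*
 * Illustrative sketch (not part of the original file): roughly how an
 * Ethernet driver's receive path hands a completed frame to netif_rx().
 * The helper name, the hypothetical "data" buffer and the two-byte
 * reserve are assumptions; eth_type_trans() comes from
 * <linux/etherdevice.h>.
 */
static void example_driver_rx(struct net_device *dev, void *data, int len)
{
	struct sk_buff *skb = dev_alloc_skb(len + 2);

	if (!skb)
		return;				/* drop silently if allocation fails */
	skb_reserve(skb, 2);			/* align the IP header after the 14-byte MAC header */
	memcpy(skb_put(skb, len), data, len);
	skb->dev = dev;
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);				/* return value reports the congestion level */
}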
1480
1481int netif_rx_ni(struct sk_buff *skb)
1482{
1483 int err;
1484
1485 preempt_disable();
1486 err = netif_rx(skb);
1487 if (local_softirq_pending())
1488 do_softirq();
1489 preempt_enable();
1490
1491 return err;
1492}
1493
1494EXPORT_SYMBOL(netif_rx_ni);
1495
1496static __inline__ void skb_bond(struct sk_buff *skb)
1497{
1498 struct net_device *dev = skb->dev;
1499
1500 if (dev->master) {
1501 skb->real_dev = skb->dev;
1502 skb->dev = dev->master;
1503 }
1504}
1505
1506static void net_tx_action(struct softirq_action *h)
1507{
1508 struct softnet_data *sd = &__get_cpu_var(softnet_data);
1509
1510 if (sd->completion_queue) {
1511 struct sk_buff *clist;
1512
1513 local_irq_disable();
1514 clist = sd->completion_queue;
1515 sd->completion_queue = NULL;
1516 local_irq_enable();
1517
1518 while (clist) {
1519 struct sk_buff *skb = clist;
1520 clist = clist->next;
1521
1522 BUG_TRAP(!atomic_read(&skb->users));
1523 __kfree_skb(skb);
1524 }
1525 }
1526
1527 if (sd->output_queue) {
1528 struct net_device *head;
1529
1530 local_irq_disable();
1531 head = sd->output_queue;
1532 sd->output_queue = NULL;
1533 local_irq_enable();
1534
1535 while (head) {
1536 struct net_device *dev = head;
1537 head = head->next_sched;
1538
1539 smp_mb__before_clear_bit();
1540 clear_bit(__LINK_STATE_SCHED, &dev->state);
1541
1542 if (spin_trylock(&dev->queue_lock)) {
1543 qdisc_run(dev);
1544 spin_unlock(&dev->queue_lock);
1545 } else {
1546 netif_schedule(dev);
1547 }
1548 }
1549 }
1550}
1551
1552static __inline__ int deliver_skb(struct sk_buff *skb,
1553 struct packet_type *pt_prev)
1554{
1555 atomic_inc(&skb->users);
1556 return pt_prev->func(skb, skb->dev, pt_prev);
1557}
1558
1559#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
1560int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
1561struct net_bridge;
1562struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
1563 unsigned char *addr);
1564void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
1565
1566static __inline__ int handle_bridge(struct sk_buff **pskb,
1567 struct packet_type **pt_prev, int *ret)
1568{
1569 struct net_bridge_port *port;
1570
1571 if ((*pskb)->pkt_type == PACKET_LOOPBACK ||
1572 (port = rcu_dereference((*pskb)->dev->br_port)) == NULL)
1573 return 0;
1574
1575 if (*pt_prev) {
1576 *ret = deliver_skb(*pskb, *pt_prev);
1577 *pt_prev = NULL;
1578 }
1579
1580 return br_handle_frame_hook(port, pskb);
1581}
1582#else
1583#define handle_bridge(skb, pt_prev, ret) (0)
1584#endif
1585
1586#ifdef CONFIG_NET_CLS_ACT
1587/* TODO: Maybe we should just force sch_ingress to be compiled in
1588 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a compare and
1589 * two extra stores right now when the ingress scheduler is not
1590 * configured but CONFIG_NET_CLS_ACT is.
1591 * NOTE: This doesn't remove any functionality; if you don't have
1592 * the ingress scheduler, you just can't add policies on ingress.
1593 *
1594 */
1595static int ing_filter(struct sk_buff *skb)
1596{
1597 struct Qdisc *q;
1598 struct net_device *dev = skb->dev;
1599 int result = TC_ACT_OK;
1600
1601 if (dev->qdisc_ingress) {
1602 __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
1603 if (MAX_RED_LOOP < ttl++) {
1604			printk("Redirect loop detected, dropping packet (%s->%s)\n",
1605			       skb->input_dev ? skb->input_dev->name : "??", skb->dev->name);
1606 return TC_ACT_SHOT;
1607 }
1608
1609		skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
1610
1611		skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1612		if (skb->input_dev == NULL) {
1613			skb->input_dev = skb->dev;
1614			printk("ing_filter: fixed %s out %s\n", skb->input_dev->name, skb->dev->name);
1615 }
1616 spin_lock(&dev->ingress_lock);
1617 if ((q = dev->qdisc_ingress) != NULL)
1618 result = q->enqueue(skb, q);
1619 spin_unlock(&dev->ingress_lock);
1620
1621 }
1622
1623 return result;
1624}
1625#endif
1626
1627int netif_receive_skb(struct sk_buff *skb)
1628{
1629 struct packet_type *ptype, *pt_prev;
1630 int ret = NET_RX_DROP;
1631 unsigned short type;
1632
1633 /* if we've gotten here through NAPI, check netpoll */
1634 if (skb->dev->poll && netpoll_rx(skb))
1635 return NET_RX_DROP;
1636
1637 if (!skb->stamp.tv_sec)
1638 net_timestamp(&skb->stamp);
1639
1640 skb_bond(skb);
1641
1642 __get_cpu_var(netdev_rx_stat).total++;
1643
1644 skb->h.raw = skb->nh.raw = skb->data;
1645 skb->mac_len = skb->nh.raw - skb->mac.raw;
1646
1647 pt_prev = NULL;
1648
1649 rcu_read_lock();
1650
1651#ifdef CONFIG_NET_CLS_ACT
1652 if (skb->tc_verd & TC_NCLS) {
1653 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
1654 goto ncls;
1655 }
1656#endif
1657
1658 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1659 if (!ptype->dev || ptype->dev == skb->dev) {
1660 if (pt_prev)
1661 ret = deliver_skb(skb, pt_prev);
1662 pt_prev = ptype;
1663 }
1664 }
1665
1666#ifdef CONFIG_NET_CLS_ACT
1667 if (pt_prev) {
1668 ret = deliver_skb(skb, pt_prev);
1669		pt_prev = NULL; /* no one else should process this after us */
1670 } else {
1671 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1672 }
1673
1674 ret = ing_filter(skb);
1675
1676	if (ret == TC_ACT_SHOT || ret == TC_ACT_STOLEN) {
1677 kfree_skb(skb);
1678 goto out;
1679 }
1680
1681 skb->tc_verd = 0;
1682ncls:
1683#endif
1684
1685 handle_diverter(skb);
1686
1687 if (handle_bridge(&skb, &pt_prev, &ret))
1688 goto out;
1689
1690 type = skb->protocol;
1691 list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
1692 if (ptype->type == type &&
1693 (!ptype->dev || ptype->dev == skb->dev)) {
1694 if (pt_prev)
1695 ret = deliver_skb(skb, pt_prev);
1696 pt_prev = ptype;
1697 }
1698 }
1699
1700 if (pt_prev) {
1701 ret = pt_prev->func(skb, skb->dev, pt_prev);
1702 } else {
1703 kfree_skb(skb);
1704		/* Jamal, now you will not be able to escape explaining
1705		 * to me how you were going to use this. :-)
1706 */
1707 ret = NET_RX_DROP;
1708 }
1709
1710out:
1711 rcu_read_unlock();
1712 return ret;
1713}
1714
1715static int process_backlog(struct net_device *backlog_dev, int *budget)
1716{
1717 int work = 0;
1718 int quota = min(backlog_dev->quota, *budget);
1719 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1720 unsigned long start_time = jiffies;
1721
1722 for (;;) {
1723 struct sk_buff *skb;
1724 struct net_device *dev;
1725
1726 local_irq_disable();
1727 skb = __skb_dequeue(&queue->input_pkt_queue);
1728 if (!skb)
1729 goto job_done;
1730 local_irq_enable();
1731
1732 dev = skb->dev;
1733
1734 netif_receive_skb(skb);
1735
1736 dev_put(dev);
1737
1738 work++;
1739
1740 if (work >= quota || jiffies - start_time > 1)
1741 break;
1742
1743 }
1744
1745 backlog_dev->quota -= work;
1746 *budget -= work;
1747 return -1;
1748
1749job_done:
1750 backlog_dev->quota -= work;
1751 *budget -= work;
1752
1753 list_del(&backlog_dev->poll_list);
1754 smp_mb__before_clear_bit();
1755 netif_poll_enable(backlog_dev);
1756
1757 if (queue->throttle)
1758 queue->throttle = 0;
1759 local_irq_enable();
1760 return 0;
1761}
1762
1763static void net_rx_action(struct softirq_action *h)
1764{
1765 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1766 unsigned long start_time = jiffies;
1767 int budget = netdev_max_backlog;
1768
1769
1770 local_irq_disable();
1771
1772 while (!list_empty(&queue->poll_list)) {
1773 struct net_device *dev;
1774
1775 if (budget <= 0 || jiffies - start_time > 1)
1776 goto softnet_break;
1777
1778 local_irq_enable();
1779
1780 dev = list_entry(queue->poll_list.next,
1781 struct net_device, poll_list);
1782 netpoll_poll_lock(dev);
1783
1784 if (dev->quota <= 0 || dev->poll(dev, &budget)) {
1785 netpoll_poll_unlock(dev);
1786 local_irq_disable();
1787 list_del(&dev->poll_list);
1788 list_add_tail(&dev->poll_list, &queue->poll_list);
1789 if (dev->quota < 0)
1790 dev->quota += dev->weight;
1791 else
1792 dev->quota = dev->weight;
1793 } else {
1794 netpoll_poll_unlock(dev);
1795 dev_put(dev);
1796 local_irq_disable();
1797 }
1798 }
1799out:
1800 local_irq_enable();
1801 return;
1802
1803softnet_break:
1804 __get_cpu_var(netdev_rx_stat).time_squeeze++;
1805 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1806 goto out;
1807}
1808
1809static gifconf_func_t *gifconf_list[NPROTO];
1810
1811/**
1812 *	register_gifconf	-	register a SIOCGIFCONF handler
1813 * @family: Address family
1814 * @gifconf: Function handler
1815 *
1816 * Register protocol dependent address dumping routines. The handler
1817 * that is passed must not be freed or reused until it has been replaced
1818 * by another handler.
1819 */
1820int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
1821{
1822 if (family >= NPROTO)
1823 return -EINVAL;
1824 gifconf_list[family] = gifconf;
1825 return 0;
1826}
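/*
 * Illustrative sketch: a hypothetical address family registering a
 * SIOCGIFCONF dumper.  dev_ifconf() below calls the handler once per
 * device; with a NULL buffer it only has to report how much space it
 * would need.  The family slot and the empty handler body are
 * assumptions.
 */
static int example_gifconf(struct net_device *dev, char __user *bufptr, int len)
{
	/* This example family has no per-device addresses to dump. */
	return 0;
}

static int __init example_gifconf_init(void)
{
	return register_gifconf(AF_UNSPEC /* hypothetical slot */, example_gifconf);
}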
1827
1828
1829/*
1830 * Map an interface index to its name (SIOCGIFNAME)
1831 */
1832
1833/*
1834 * We need this ioctl for efficient implementation of the
1835 * if_indextoname() function required by the IPv6 API. Without
1836 * it, we would have to search all the interfaces to find a
1837 * match. --pb
1838 */
1839
1840static int dev_ifname(struct ifreq __user *arg)
1841{
1842 struct net_device *dev;
1843 struct ifreq ifr;
1844
1845 /*
1846 * Fetch the caller's info block.
1847 */
1848
1849 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1850 return -EFAULT;
1851
1852 read_lock(&dev_base_lock);
1853 dev = __dev_get_by_index(ifr.ifr_ifindex);
1854 if (!dev) {
1855 read_unlock(&dev_base_lock);
1856 return -ENODEV;
1857 }
1858
1859 strcpy(ifr.ifr_name, dev->name);
1860 read_unlock(&dev_base_lock);
1861
1862 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1863 return -EFAULT;
1864 return 0;
1865}
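/*
 * Illustrative sketch (user space, assumed program): an
 * if_indextoname()-style lookup built on SIOCGIFNAME.  The choice of an
 * AF_INET datagram socket to carry the ioctl is arbitrary.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *
 *	char *index_to_name(unsigned int ifindex, char *name)
 *	{
 *		struct ifreq ifr;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		if (fd < 0)
 *			return NULL;
 *		memset(&ifr, 0, sizeof(ifr));
 *		ifr.ifr_ifindex = ifindex;
 *		if (ioctl(fd, SIOCGIFNAME, &ifr) < 0) {
 *			close(fd);
 *			return NULL;
 *		}
 *		close(fd);
 *		return strcpy(name, ifr.ifr_name);
 *	}
 */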
1866
1867/*
1868 * Perform a SIOCGIFCONF call. This structure will change
1869 * size eventually, and there is nothing I can do about it.
1870 * Thus we will need a 'compatibility mode'.
1871 */
1872
1873static int dev_ifconf(char __user *arg)
1874{
1875 struct ifconf ifc;
1876 struct net_device *dev;
1877 char __user *pos;
1878 int len;
1879 int total;
1880 int i;
1881
1882 /*
1883 * Fetch the caller's info block.
1884 */
1885
1886 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
1887 return -EFAULT;
1888
1889 pos = ifc.ifc_buf;
1890 len = ifc.ifc_len;
1891
1892 /*
1893 * Loop over the interfaces, and write an info block for each.
1894 */
1895
1896 total = 0;
1897 for (dev = dev_base; dev; dev = dev->next) {
1898 for (i = 0; i < NPROTO; i++) {
1899 if (gifconf_list[i]) {
1900 int done;
1901 if (!pos)
1902 done = gifconf_list[i](dev, NULL, 0);
1903 else
1904 done = gifconf_list[i](dev, pos + total,
1905 len - total);
1906 if (done < 0)
1907 return -EFAULT;
1908 total += done;
1909 }
1910 }
1911 }
1912
1913 /*
1914 * All done. Write the updated control block back to the caller.
1915 */
1916 ifc.ifc_len = total;
1917
1918 /*
1919 * Both BSD and Solaris return 0 here, so we do too.
1920 */
1921 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
1922}
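/*
 * Illustrative sketch (user space, assumed program): enumerating
 * interfaces with SIOCGIFCONF.  Only families that registered a gifconf
 * handler contribute entries, so in practice this lists interfaces with
 * an IPv4 address.  The fixed-size request array is an assumption.
 *
 *	struct ifreq reqs[32];
 *	struct ifconf ifc;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *	int i, n;
 *
 *	ifc.ifc_len = sizeof(reqs);
 *	ifc.ifc_req = reqs;
 *	if (fd >= 0 && ioctl(fd, SIOCGIFCONF, &ifc) == 0) {
 *		n = ifc.ifc_len / sizeof(struct ifreq);
 *		for (i = 0; i < n; i++)
 *			printf("%s\n", reqs[i].ifr_name);
 *	}
 */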
1923
1924#ifdef CONFIG_PROC_FS
1925/*
1926 * This is invoked by the /proc filesystem handler to display a device
1927 * in detail.
1928 */
1929static __inline__ struct net_device *dev_get_idx(loff_t pos)
1930{
1931 struct net_device *dev;
1932 loff_t i;
1933
1934 for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
1935
1936 return i == pos ? dev : NULL;
1937}
1938
1939void *dev_seq_start(struct seq_file *seq, loff_t *pos)
1940{
1941 read_lock(&dev_base_lock);
1942 return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
1943}
1944
1945void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1946{
1947 ++*pos;
1948 return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
1949}
1950
1951void dev_seq_stop(struct seq_file *seq, void *v)
1952{
1953 read_unlock(&dev_base_lock);
1954}
1955
1956static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
1957{
1958 if (dev->get_stats) {
1959 struct net_device_stats *stats = dev->get_stats(dev);
1960
1961 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
1962 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
1963 dev->name, stats->rx_bytes, stats->rx_packets,
1964 stats->rx_errors,
1965 stats->rx_dropped + stats->rx_missed_errors,
1966 stats->rx_fifo_errors,
1967 stats->rx_length_errors + stats->rx_over_errors +
1968 stats->rx_crc_errors + stats->rx_frame_errors,
1969 stats->rx_compressed, stats->multicast,
1970 stats->tx_bytes, stats->tx_packets,
1971 stats->tx_errors, stats->tx_dropped,
1972 stats->tx_fifo_errors, stats->collisions,
1973 stats->tx_carrier_errors +
1974 stats->tx_aborted_errors +
1975 stats->tx_window_errors +
1976 stats->tx_heartbeat_errors,
1977 stats->tx_compressed);
1978 } else
1979 seq_printf(seq, "%6s: No statistics available.\n", dev->name);
1980}
1981
1982/*
1983 * Called from the PROCfs module. This now uses the new arbitrary sized
1984 * /proc/net interface to create /proc/net/dev
1985 */
1986static int dev_seq_show(struct seq_file *seq, void *v)
1987{
1988 if (v == SEQ_START_TOKEN)
1989 seq_puts(seq, "Inter-| Receive "
1990 " | Transmit\n"
1991 " face |bytes packets errs drop fifo frame "
1992 "compressed multicast|bytes packets errs "
1993 "drop fifo colls carrier compressed\n");
1994 else
1995 dev_seq_printf_stats(seq, v);
1996 return 0;
1997}
1998
1999static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2000{
2001 struct netif_rx_stats *rc = NULL;
2002
2003 while (*pos < NR_CPUS)
2004 if (cpu_online(*pos)) {
2005 rc = &per_cpu(netdev_rx_stat, *pos);
2006 break;
2007 } else
2008 ++*pos;
2009 return rc;
2010}
2011
2012static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2013{
2014 return softnet_get_online(pos);
2015}
2016
2017static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2018{
2019 ++*pos;
2020 return softnet_get_online(pos);
2021}
2022
2023static void softnet_seq_stop(struct seq_file *seq, void *v)
2024{
2025}
2026
2027static int softnet_seq_show(struct seq_file *seq, void *v)
2028{
2029 struct netif_rx_stats *s = v;
2030
2031 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2032 s->total, s->dropped, s->time_squeeze, s->throttled,
2033 s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
2034 s->fastroute_deferred_out,
2035#if 0
2036 s->fastroute_latency_reduction
2037#else
2038 s->cpu_collision
2039#endif
2040 );
2041 return 0;
2042}
2043
2044static struct seq_operations dev_seq_ops = {
2045 .start = dev_seq_start,
2046 .next = dev_seq_next,
2047 .stop = dev_seq_stop,
2048 .show = dev_seq_show,
2049};
2050
2051static int dev_seq_open(struct inode *inode, struct file *file)
2052{
2053 return seq_open(file, &dev_seq_ops);
2054}
2055
2056static struct file_operations dev_seq_fops = {
2057 .owner = THIS_MODULE,
2058 .open = dev_seq_open,
2059 .read = seq_read,
2060 .llseek = seq_lseek,
2061 .release = seq_release,
2062};
2063
2064static struct seq_operations softnet_seq_ops = {
2065 .start = softnet_seq_start,
2066 .next = softnet_seq_next,
2067 .stop = softnet_seq_stop,
2068 .show = softnet_seq_show,
2069};
2070
2071static int softnet_seq_open(struct inode *inode, struct file *file)
2072{
2073 return seq_open(file, &softnet_seq_ops);
2074}
2075
2076static struct file_operations softnet_seq_fops = {
2077 .owner = THIS_MODULE,
2078 .open = softnet_seq_open,
2079 .read = seq_read,
2080 .llseek = seq_lseek,
2081 .release = seq_release,
2082};
2083
2084#ifdef WIRELESS_EXT
2085extern int wireless_proc_init(void);
2086#else
2087#define wireless_proc_init() 0
2088#endif
2089
2090static int __init dev_proc_init(void)
2091{
2092 int rc = -ENOMEM;
2093
2094 if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2095 goto out;
2096 if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2097 goto out_dev;
2098 if (wireless_proc_init())
2099 goto out_softnet;
2100 rc = 0;
2101out:
2102 return rc;
2103out_softnet:
2104 proc_net_remove("softnet_stat");
2105out_dev:
2106 proc_net_remove("dev");
2107 goto out;
2108}
2109#else
2110#define dev_proc_init() 0
2111#endif /* CONFIG_PROC_FS */
2112
2113
2114/**
2115 * netdev_set_master - set up master/slave pair
2116 * @slave: slave device
2117 * @master: new master device
2118 *
2119 * Changes the master device of the slave. Pass %NULL to break the
2120 * bonding. The caller must hold the RTNL semaphore. On a failure
2121 * a negative errno code is returned. On success the reference counts
2122 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2123 * function returns zero.
2124 */
2125int netdev_set_master(struct net_device *slave, struct net_device *master)
2126{
2127 struct net_device *old = slave->master;
2128
2129 ASSERT_RTNL();
2130
2131 if (master) {
2132 if (old)
2133 return -EBUSY;
2134 dev_hold(master);
2135 }
2136
2137 slave->master = master;
2138
2139 synchronize_net();
2140
2141 if (old)
2142 dev_put(old);
2143
2144 if (master)
2145 slave->flags |= IFF_SLAVE;
2146 else
2147 slave->flags &= ~IFF_SLAVE;
2148
2149 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2150 return 0;
2151}
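/*
 * Illustrative sketch: how a bonding-style driver might pair and later
 * release a slave.  The function names are assumptions; only the
 * netdev_set_master() calls and the RTNL requirement come from the
 * interface documented above.
 */
static int example_enslave(struct net_device *bond_dev,
			   struct net_device *slave_dev)
{
	ASSERT_RTNL();				/* caller holds the RTNL semaphore */
	return netdev_set_master(slave_dev, bond_dev);
}

static void example_release(struct net_device *slave_dev)
{
	ASSERT_RTNL();
	netdev_set_master(slave_dev, NULL);	/* break the pairing */
}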
2152
2153/**
2154 * dev_set_promiscuity - update promiscuity count on a device
2155 * @dev: device
2156 * @inc: modifier
2157 *
2158 * Add or remove promiscuity from a device. While the count in the device
2159 * remains above zero the interface remains promiscuous. Once it hits zero
2160 * the device reverts back to normal filtering operation. A negative inc
2161 * value is used to drop promiscuity on the device.
2162 */
2163void dev_set_promiscuity(struct net_device *dev, int inc)
2164{
2165 unsigned short old_flags = dev->flags;
2166
2167 dev->flags |= IFF_PROMISC;
2168 if ((dev->promiscuity += inc) == 0)
2169 dev->flags &= ~IFF_PROMISC;
2170 if (dev->flags ^ old_flags) {
2171 dev_mc_upload(dev);
2172 printk(KERN_INFO "device %s %s promiscuous mode\n",
2173 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2174 "left");
2175 }
2176}
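/*
 * Illustrative sketch: a packet-capture style user of the promiscuity
 * count.  Each +1 taken while capturing must be matched by a -1 on
 * teardown; the function names are assumptions.
 */
static void example_capture_start(struct net_device *dev)
{
	dev_set_promiscuity(dev, 1);		/* bump the count, sets IFF_PROMISC */
}

static void example_capture_stop(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);		/* drop the count, may clear IFF_PROMISC */
}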
2177
2178/**
2179 * dev_set_allmulti - update allmulti count on a device
2180 * @dev: device
2181 * @inc: modifier
2182 *
2183 * Add or remove reception of all multicast frames to a device. While the
2184 * count in the device remains above zero the interface remains listening
2185 * to all multicast frames. Once it hits zero the device reverts back to normal
2186 * filtering operation. A negative @inc value is used to drop the counter
2187 * when releasing a resource needing all multicasts.
2188 */
2189
2190void dev_set_allmulti(struct net_device *dev, int inc)
2191{
2192 unsigned short old_flags = dev->flags;
2193
2194 dev->flags |= IFF_ALLMULTI;
2195 if ((dev->allmulti += inc) == 0)
2196 dev->flags &= ~IFF_ALLMULTI;
2197 if (dev->flags ^ old_flags)
2198 dev_mc_upload(dev);
2199}
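/*
 * Illustrative sketch: a protocol that needs every multicast frame while
 * one of its resources is active uses the same balanced-counter pattern
 * as the promiscuity example above.  The function names are assumptions.
 */
static void example_allmulti_get(struct net_device *dev)
{
	dev_set_allmulti(dev, 1);
}

static void example_allmulti_put(struct net_device *dev)
{
	dev_set_allmulti(dev, -1);
}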
2200
2201unsigned dev_get_flags(const struct net_device *dev)
2202{
2203 unsigned flags;
2204
2205 flags = (dev->flags & ~(IFF_PROMISC |
2206 IFF_ALLMULTI |
2207 IFF_RUNNING)) |
2208 (dev->gflags & (IFF_PROMISC |
2209 IFF_ALLMULTI));
2210
2211 if (netif_running(dev) && netif_carrier_ok(dev))
2212 flags |= IFF_RUNNING;
2213
2214 return flags;
2215}
2216
2217int dev_change_flags(struct net_device *dev, unsigned flags)
2218{
2219 int ret;
2220 int old_flags = dev->flags;
2221
2222 /*
2223 * Set the flags on our device.
2224 */
2225
2226 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2227 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2228 IFF_AUTOMEDIA)) |
2229 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2230 IFF_ALLMULTI));
2231
2232 /*
2233 * Load in the correct multicast list now the flags have changed.
2234 */
2235
2236 dev_mc_upload(dev);
2237
2238 /*
2239	 * Have we downed the interface? We handle IFF_UP ourselves
2240 * according to user attempts to set it, rather than blindly
2241 * setting it.
2242 */
2243
2244 ret = 0;
2245 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
2246 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2247
2248 if (!ret)
2249 dev_mc_upload(dev);
2250 }
2251
2252 if (dev->flags & IFF_UP &&
2253 ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2254 IFF_VOLATILE)))
2255 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
2256
2257 if ((flags ^ dev->gflags) & IFF_PROMISC) {
2258 int inc = (flags & IFF_PROMISC) ? +1 : -1;
2259 dev->gflags ^= IFF_PROMISC;
2260 dev_set_promiscuity(dev, inc);
2261 }
2262
2263	/* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2264	   is important. Some (broken) drivers set IFF_PROMISC when
2265	   IFF_ALLMULTI is requested, without asking us and without reporting it.
2266 */
2267 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2268 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2269 dev->gflags ^= IFF_ALLMULTI;
2270 dev_set_allmulti(dev, inc);
2271 }
2272
2273 if (old_flags ^ dev->flags)
2274 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2275
2276 return ret;
2277}
2278
2279int dev_set_mtu(struct net_device *dev, int new_mtu)
2280{
2281 int err;
2282
2283 if (new_mtu == dev->mtu)
2284 return 0;
2285
2286 /* MTU must be positive. */
2287 if (new_mtu < 0)
2288 return -EINVAL;
2289
2290 if (!netif_device_present(dev))
2291 return -ENODEV;
2292
2293 err = 0;
2294 if (dev->change_mtu)
2295 err = dev->change_mtu(dev, new_mtu);
2296 else
2297 dev->mtu = new_mtu;
2298 if (!err && dev->flags & IFF_UP)
2299 notifier_call_chain(&netdev_chain,
2300 NETDEV_CHANGEMTU, dev);
2301 return err;
2302}
2303
2304int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2305{
2306 int err;
2307
2308 if (!dev->set_mac_address)
2309 return -EOPNOTSUPP;
2310 if (sa->sa_family != dev->type)
2311 return -EINVAL;
2312 if (!netif_device_present(dev))
2313 return -ENODEV;
2314 err = dev->set_mac_address(dev, sa);
2315 if (!err)
2316 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
2317 return err;
2318}
2319
2320/*
2321 * Perform the SIOCxIFxxx calls.
2322 */
2323static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2324{
2325 int err;
2326 struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2327
2328 if (!dev)
2329 return -ENODEV;
2330
2331 switch (cmd) {
2332 case SIOCGIFFLAGS: /* Get interface flags */
2333 ifr->ifr_flags = dev_get_flags(dev);
2334 return 0;
2335
2336 case SIOCSIFFLAGS: /* Set interface flags */
2337 return dev_change_flags(dev, ifr->ifr_flags);
2338
2339 case SIOCGIFMETRIC: /* Get the metric on the interface
2340 (currently unused) */
2341 ifr->ifr_metric = 0;
2342 return 0;
2343
2344 case SIOCSIFMETRIC: /* Set the metric on the interface
2345 (currently unused) */
2346 return -EOPNOTSUPP;
2347
2348 case SIOCGIFMTU: /* Get the MTU of a device */
2349 ifr->ifr_mtu = dev->mtu;
2350 return 0;
2351
2352 case SIOCSIFMTU: /* Set the MTU of a device */
2353 return dev_set_mtu(dev, ifr->ifr_mtu);
2354
2355 case SIOCGIFHWADDR:
2356 if (!dev->addr_len)
2357 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2358 else
2359 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
2360 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2361 ifr->ifr_hwaddr.sa_family = dev->type;
2362 return 0;
2363
2364 case SIOCSIFHWADDR:
2365 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
2366
2367 case SIOCSIFHWBROADCAST:
2368 if (ifr->ifr_hwaddr.sa_family != dev->type)
2369 return -EINVAL;
2370 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2371 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2372 notifier_call_chain(&netdev_chain,
2373 NETDEV_CHANGEADDR, dev);
2374 return 0;
2375
2376 case SIOCGIFMAP:
2377 ifr->ifr_map.mem_start = dev->mem_start;
2378 ifr->ifr_map.mem_end = dev->mem_end;
2379 ifr->ifr_map.base_addr = dev->base_addr;
2380 ifr->ifr_map.irq = dev->irq;
2381 ifr->ifr_map.dma = dev->dma;
2382 ifr->ifr_map.port = dev->if_port;
2383 return 0;
2384
2385 case SIOCSIFMAP:
2386 if (dev->set_config) {
2387 if (!netif_device_present(dev))
2388 return -ENODEV;
2389 return dev->set_config(dev, &ifr->ifr_map);
2390 }
2391 return -EOPNOTSUPP;
2392
2393 case SIOCADDMULTI:
2394 if (!dev->set_multicast_list ||
2395 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2396 return -EINVAL;
2397 if (!netif_device_present(dev))
2398 return -ENODEV;
2399 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
2400 dev->addr_len, 1);
2401
2402 case SIOCDELMULTI:
2403 if (!dev->set_multicast_list ||
2404 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2405 return -EINVAL;
2406 if (!netif_device_present(dev))
2407 return -ENODEV;
2408 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2409 dev->addr_len, 1);
2410
2411 case SIOCGIFINDEX:
2412 ifr->ifr_ifindex = dev->ifindex;
2413 return 0;
2414
2415 case SIOCGIFTXQLEN:
2416 ifr->ifr_qlen = dev->tx_queue_len;
2417 return 0;
2418
2419 case SIOCSIFTXQLEN:
2420 if (ifr->ifr_qlen < 0)
2421 return -EINVAL;
2422 dev->tx_queue_len = ifr->ifr_qlen;
2423 return 0;
2424
2425 case SIOCSIFNAME:
2426 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
2427 return dev_change_name(dev, ifr->ifr_newname);
2428
2429 /*
2430 * Unknown or private ioctl
2431 */
2432
2433 default:
2434 if ((cmd >= SIOCDEVPRIVATE &&
2435 cmd <= SIOCDEVPRIVATE + 15) ||
2436 cmd == SIOCBONDENSLAVE ||
2437 cmd == SIOCBONDRELEASE ||
2438 cmd == SIOCBONDSETHWADDR ||
2439 cmd == SIOCBONDSLAVEINFOQUERY ||
2440 cmd == SIOCBONDINFOQUERY ||
2441 cmd == SIOCBONDCHANGEACTIVE ||
2442 cmd == SIOCGMIIPHY ||
2443 cmd == SIOCGMIIREG ||
2444 cmd == SIOCSMIIREG ||
2445 cmd == SIOCBRADDIF ||
2446 cmd == SIOCBRDELIF ||
2447 cmd == SIOCWANDEV) {
2448 err = -EOPNOTSUPP;
2449 if (dev->do_ioctl) {
2450 if (netif_device_present(dev))
2451 err = dev->do_ioctl(dev, ifr,
2452 cmd);
2453 else
2454 err = -ENODEV;
2455 }
2456 } else
2457 err = -EINVAL;
2458
2459 }
2460 return err;
2461}
2462
2463/*
2464 * This function handles all "interface"-type I/O control requests. The actual
2465 * 'doing' part of this is dev_ifsioc above.
2466 */
2467
2468/**
2469 * dev_ioctl - network device ioctl
2470 * @cmd: command to issue
2471 * @arg: pointer to a struct ifreq in user space
2472 *
2473 * Issue ioctl functions to devices. This is normally called by the
2474 * user space syscall interfaces but can sometimes be useful for
2475 * other purposes. The return value is the return from the syscall if
2476 * positive or a negative errno code on error.
2477 */
2478
2479int dev_ioctl(unsigned int cmd, void __user *arg)
2480{
2481 struct ifreq ifr;
2482 int ret;
2483 char *colon;
2484
2485	/* One special case: SIOCGIFCONF takes an ifconf argument
2486	   and requires a shared lock, because it sleeps while writing
2487 to user space.
2488 */
2489
2490 if (cmd == SIOCGIFCONF) {
2491 rtnl_shlock();
2492 ret = dev_ifconf((char __user *) arg);
2493 rtnl_shunlock();
2494 return ret;
2495 }
2496 if (cmd == SIOCGIFNAME)
2497 return dev_ifname((struct ifreq __user *)arg);
2498
2499 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2500 return -EFAULT;
2501
2502 ifr.ifr_name[IFNAMSIZ-1] = 0;
2503
2504 colon = strchr(ifr.ifr_name, ':');
2505 if (colon)
2506 *colon = 0;
2507
2508 /*
2509 * See which interface the caller is talking about.
2510 */
2511
2512 switch (cmd) {
2513 /*
2514 * These ioctl calls:
2515 * - can be done by all.
2516 * - atomic and do not require locking.
2517 * - return a value
2518 */
2519 case SIOCGIFFLAGS:
2520 case SIOCGIFMETRIC:
2521 case SIOCGIFMTU:
2522 case SIOCGIFHWADDR:
2523 case SIOCGIFSLAVE:
2524 case SIOCGIFMAP:
2525 case SIOCGIFINDEX:
2526 case SIOCGIFTXQLEN:
2527 dev_load(ifr.ifr_name);
2528 read_lock(&dev_base_lock);
2529 ret = dev_ifsioc(&ifr, cmd);
2530 read_unlock(&dev_base_lock);
2531 if (!ret) {
2532 if (colon)
2533 *colon = ':';
2534 if (copy_to_user(arg, &ifr,
2535 sizeof(struct ifreq)))
2536 ret = -EFAULT;
2537 }
2538 return ret;
2539
2540 case SIOCETHTOOL:
2541 dev_load(ifr.ifr_name);
2542 rtnl_lock();
2543 ret = dev_ethtool(&ifr);
2544 rtnl_unlock();
2545 if (!ret) {
2546 if (colon)
2547 *colon = ':';
2548 if (copy_to_user(arg, &ifr,
2549 sizeof(struct ifreq)))
2550 ret = -EFAULT;
2551 }
2552 return ret;
2553
2554 /*
2555 * These ioctl calls:
2556 * - require superuser power.
2557 * - require strict serialization.
2558 * - return a value
2559 */
2560 case SIOCGMIIPHY:
2561 case SIOCGMIIREG:
2562 case SIOCSIFNAME:
2563 if (!capable(CAP_NET_ADMIN))
2564 return -EPERM;
2565 dev_load(ifr.ifr_name);
2566 rtnl_lock();
2567 ret = dev_ifsioc(&ifr, cmd);
2568 rtnl_unlock();
2569 if (!ret) {
2570 if (colon)
2571 *colon = ':';
2572 if (copy_to_user(arg, &ifr,
2573 sizeof(struct ifreq)))
2574 ret = -EFAULT;
2575 }
2576 return ret;
2577
2578 /*
2579 * These ioctl calls:
2580 * - require superuser power.
2581 * - require strict serialization.
2582 * - do not return a value
2583 */
2584 case SIOCSIFFLAGS:
2585 case SIOCSIFMETRIC:
2586 case SIOCSIFMTU:
2587 case SIOCSIFMAP:
2588 case SIOCSIFHWADDR:
2589 case SIOCSIFSLAVE:
2590 case SIOCADDMULTI:
2591 case SIOCDELMULTI:
2592 case SIOCSIFHWBROADCAST:
2593 case SIOCSIFTXQLEN:
2594 case SIOCSMIIREG:
2595 case SIOCBONDENSLAVE:
2596 case SIOCBONDRELEASE:
2597 case SIOCBONDSETHWADDR:
2598 case SIOCBONDSLAVEINFOQUERY:
2599 case SIOCBONDINFOQUERY:
2600 case SIOCBONDCHANGEACTIVE:
2601 case SIOCBRADDIF:
2602 case SIOCBRDELIF:
2603 if (!capable(CAP_NET_ADMIN))
2604 return -EPERM;
2605 dev_load(ifr.ifr_name);
2606 rtnl_lock();
2607 ret = dev_ifsioc(&ifr, cmd);
2608 rtnl_unlock();
2609 return ret;
2610
2611 case SIOCGIFMEM:
2612 /* Get the per device memory space. We can add this but
2613 * currently do not support it */
2614 case SIOCSIFMEM:
2615 /* Set the per device memory buffer space.
2616 * Not applicable in our case */
2617 case SIOCSIFLINK:
2618 return -EINVAL;
2619
2620 /*
2621 * Unknown or private ioctl.
2622 */
2623 default:
2624 if (cmd == SIOCWANDEV ||
2625 (cmd >= SIOCDEVPRIVATE &&
2626 cmd <= SIOCDEVPRIVATE + 15)) {
2627 dev_load(ifr.ifr_name);
2628 rtnl_lock();
2629 ret = dev_ifsioc(&ifr, cmd);
2630 rtnl_unlock();
2631 if (!ret && copy_to_user(arg, &ifr,
2632 sizeof(struct ifreq)))
2633 ret = -EFAULT;
2634 return ret;
2635 }
2636#ifdef WIRELESS_EXT
2637 /* Take care of Wireless Extensions */
2638 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
2639 /* If command is `set a parameter', or
2640 * `get the encoding parameters', check if
2641 * the user has the right to do it */
2642 if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) {
2643 if (!capable(CAP_NET_ADMIN))
2644 return -EPERM;
2645 }
2646 dev_load(ifr.ifr_name);
2647 rtnl_lock();
2648 /* Follow me in net/core/wireless.c */
2649 ret = wireless_process_ioctl(&ifr, cmd);
2650 rtnl_unlock();
2651 if (IW_IS_GET(cmd) &&
2652 copy_to_user(arg, &ifr,
2653 sizeof(struct ifreq)))
2654 ret = -EFAULT;
2655 return ret;
2656 }
2657#endif /* WIRELESS_EXT */
2658 return -EINVAL;
2659 }
2660}
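/*
 * Illustrative sketch (user space, assumed program): these requests reach
 * dev_ioctl() through an ioctl() on any socket.  Reading the MTU needs no
 * privilege; setting it falls under the CAP_NET_ADMIN cases above.  The
 * interface name and MTU value are assumptions.
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 *
 *	ifr.ifr_mtu = 1400;
 *	if (ioctl(fd, SIOCSIFMTU, &ifr) < 0)
 *		perror("SIOCSIFMTU");
 */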
2661
2662
2663/**
2664 * dev_new_index - allocate an ifindex
2665 *
2666 * Returns a suitable unique value for a new device interface
2667 * number. The caller must hold the rtnl semaphore or the
2668 * dev_base_lock to be sure it remains unique.
2669 */
2670static int dev_new_index(void)
2671{
2672 static int ifindex;
2673 for (;;) {
2674 if (++ifindex <= 0)
2675 ifindex = 1;
2676 if (!__dev_get_by_index(ifindex))
2677 return ifindex;
2678 }
2679}
2680
2681static int dev_boot_phase = 1;
2682
2683/* Delayed registration/unregisteration */
2684static DEFINE_SPINLOCK(net_todo_list_lock);
2685static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
2686
2687static inline void net_set_todo(struct net_device *dev)
2688{
2689 spin_lock(&net_todo_list_lock);
2690 list_add_tail(&dev->todo_list, &net_todo_list);
2691 spin_unlock(&net_todo_list_lock);
2692}
2693
2694/**
2695 * register_netdevice - register a network device
2696 * @dev: device to register
2697 *
2698 * Take a completed network device structure and add it to the kernel
2699 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2700 * chain. 0 is returned on success. A negative errno code is returned
2701 * on a failure to set up the device, or if the name is a duplicate.
2702 *
2703 * Callers must hold the rtnl semaphore. You may want
2704 * register_netdev() instead of this.
2705 *
2706 * BUGS:
2707 * The locking appears insufficient to guarantee two parallel registers
2708 * will not get the same name.
2709 */
2710
2711int register_netdevice(struct net_device *dev)
2712{
2713 struct hlist_head *head;
2714 struct hlist_node *p;
2715 int ret;
2716
2717 BUG_ON(dev_boot_phase);
2718 ASSERT_RTNL();
2719
2720 /* When net_device's are persistent, this will be fatal. */
2721 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
2722
2723 spin_lock_init(&dev->queue_lock);
2724 spin_lock_init(&dev->xmit_lock);
2725 dev->xmit_lock_owner = -1;
2726#ifdef CONFIG_NET_CLS_ACT
2727 spin_lock_init(&dev->ingress_lock);
2728#endif
2729
2730 ret = alloc_divert_blk(dev);
2731 if (ret)
2732 goto out;
2733
2734 dev->iflink = -1;
2735
2736 /* Init, if this function is available */
2737 if (dev->init) {
2738 ret = dev->init(dev);
2739 if (ret) {
2740 if (ret > 0)
2741 ret = -EIO;
2742 goto out_err;
2743 }
2744 }
2745
2746 if (!dev_valid_name(dev->name)) {
2747 ret = -EINVAL;
2748 goto out_err;
2749 }
2750
2751 dev->ifindex = dev_new_index();
2752 if (dev->iflink == -1)
2753 dev->iflink = dev->ifindex;
2754
2755 /* Check for existence of name */
2756 head = dev_name_hash(dev->name);
2757 hlist_for_each(p, head) {
2758 struct net_device *d
2759 = hlist_entry(p, struct net_device, name_hlist);
2760 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
2761 ret = -EEXIST;
2762 goto out_err;
2763 }
2764 }
2765
2766 /* Fix illegal SG+CSUM combinations. */
2767 if ((dev->features & NETIF_F_SG) &&
2768 !(dev->features & (NETIF_F_IP_CSUM |
2769 NETIF_F_NO_CSUM |
2770 NETIF_F_HW_CSUM))) {
2771 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
2772 dev->name);
2773 dev->features &= ~NETIF_F_SG;
2774 }
2775
2776 /* TSO requires that SG is present as well. */
2777 if ((dev->features & NETIF_F_TSO) &&
2778 !(dev->features & NETIF_F_SG)) {
2779 printk("%s: Dropping NETIF_F_TSO since no SG feature.\n",
2780 dev->name);
2781 dev->features &= ~NETIF_F_TSO;
2782 }
2783
2784 /*
2785	 *	Nil rebuild_header routine: it should never be called
2786	 *	and is used only as a bug trap.
2787 */
2788
2789 if (!dev->rebuild_header)
2790 dev->rebuild_header = default_rebuild_header;
2791
2792 /*
2793 * Default initial state at registry is that the
2794 * device is present.
2795 */
2796
2797 set_bit(__LINK_STATE_PRESENT, &dev->state);
2798
2799 dev->next = NULL;
2800 dev_init_scheduler(dev);
2801 write_lock_bh(&dev_base_lock);
2802 *dev_tail = dev;
2803 dev_tail = &dev->next;
2804 hlist_add_head(&dev->name_hlist, head);
2805 hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
2806 dev_hold(dev);
2807 dev->reg_state = NETREG_REGISTERING;
2808 write_unlock_bh(&dev_base_lock);
2809
2810 /* Notify protocols, that a new device appeared. */
2811 notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
2812
2813 /* Finish registration after unlock */
2814 net_set_todo(dev);
2815 ret = 0;
2816
2817out:
2818 return ret;
2819out_err:
2820 free_divert_blk(dev);
2821 goto out;
2822}
2823
2824/**
2825 * register_netdev - register a network device
2826 * @dev: device to register
2827 *
2828 * Take a completed network device structure and add it to the kernel
2829 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2830 * chain. 0 is returned on success. A negative errno code is returned
2831 * on a failure to set up the device, or if the name is a duplicate.
2832 *
2833 * This is a wrapper around register_netdevice that takes the rtnl semaphore
2834 * and expands the device name if you passed a format string to
2835 * alloc_netdev.
2836 */
2837int register_netdev(struct net_device *dev)
2838{
2839 int err;
2840
2841 rtnl_lock();
2842
2843 /*
2844 * If the name is a format string the caller wants us to do a
2845 * name allocation.
2846 */
2847 if (strchr(dev->name, '%')) {
2848 err = dev_alloc_name(dev, dev->name);
2849 if (err < 0)
2850 goto out;
2851 }
2852
2853 /*
2854	 * Backward-compatibility hook. Kill this one in 2.5.
2855 */
2856 if (dev->name[0] == 0 || dev->name[0] == ' ') {
2857 err = dev_alloc_name(dev, "eth%d");
2858 if (err < 0)
2859 goto out;
2860 }
2861
2862 err = register_netdevice(dev);
2863out:
2864 rtnl_unlock();
2865 return err;
2866}
2867EXPORT_SYMBOL(register_netdev);
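/*
 * Illustrative sketch: the usual driver-side sequence built on the helper
 * above.  ether_setup() (from <linux/etherdevice.h>) fills in the common
 * Ethernet defaults; the private structure and the probe function name
 * are assumptions.
 */
struct example_priv {
	int some_state;
};

static int __init example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(sizeof(struct example_priv), "eth%d", ether_setup);
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);	/* takes rtnl and expands "eth%d" */
	if (err)
		free_netdev(dev);
	return err;
}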
2868
2869/*
2870 * netdev_wait_allrefs - wait until all references are gone.
2871 *
2872 * This is called when unregistering network devices.
2873 *
2874 * Any protocol or device that holds a reference should register
2875 * for netdevice notification, and cleanup and put back the
2876 * reference if they receive an UNREGISTER event.
2877 * We can get stuck here if buggy protocols don't correctly
2878 * call dev_put.
2879 */
2880static void netdev_wait_allrefs(struct net_device *dev)
2881{
2882 unsigned long rebroadcast_time, warning_time;
2883
2884 rebroadcast_time = warning_time = jiffies;
2885 while (atomic_read(&dev->refcnt) != 0) {
2886 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
2887 rtnl_shlock();
2888
2889 /* Rebroadcast unregister notification */
2890 notifier_call_chain(&netdev_chain,
2891 NETDEV_UNREGISTER, dev);
2892
2893 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
2894 &dev->state)) {
2895 /* We must not have linkwatch events
2896 * pending on unregister. If this
2897 * happens, we simply run the queue
2898 * unscheduled, resulting in a noop
2899 * for this device.
2900 */
2901 linkwatch_run_queue();
2902 }
2903
2904 rtnl_shunlock();
2905
2906 rebroadcast_time = jiffies;
2907 }
2908
2909 msleep(250);
2910
2911 if (time_after(jiffies, warning_time + 10 * HZ)) {
2912 printk(KERN_EMERG "unregister_netdevice: "
2913 "waiting for %s to become free. Usage "
2914 "count = %d\n",
2915 dev->name, atomic_read(&dev->refcnt));
2916 warning_time = jiffies;
2917 }
2918 }
2919}
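/*
 * Illustrative sketch: what a well-behaved reference holder looks like
 * from the point of view of the wait loop above.  The cached-device
 * pointer and the notifier names are assumptions.
 */
static struct net_device *example_cached_dev;	/* holds one dev_hold() reference */

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER && dev == example_cached_dev) {
		example_cached_dev = NULL;
		dev_put(dev);		/* let the refcount reach zero */
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};
/* registered elsewhere with register_netdevice_notifier(&example_netdev_notifier) */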
2920
2921/* The sequence is:
2922 *
2923 * rtnl_lock();
2924 * ...
2925 * register_netdevice(x1);
2926 * register_netdevice(x2);
2927 * ...
2928 * unregister_netdevice(y1);
2929 * unregister_netdevice(y2);
2930 * ...
2931 * rtnl_unlock();
2932 * free_netdev(y1);
2933 * free_netdev(y2);
2934 *
2935 * We are invoked by rtnl_unlock() after it drops the semaphore.
2936 * This allows us to deal with problems:
2937 * 1) We can create/delete sysfs objects which invoke hotplug
2938 * without deadlocking with linkwatch via keventd.
2939 * 2) Since we run with the RTNL semaphore not held, we can sleep
2940 * safely in order to wait for the netdev refcnt to drop to zero.
2941 */
2942static DECLARE_MUTEX(net_todo_run_mutex);
2943void netdev_run_todo(void)
2944{
2945 struct list_head list = LIST_HEAD_INIT(list);
2946 int err;
2947
2948
2949	/* Need to guard against multiple CPUs getting out of order. */
2950 down(&net_todo_run_mutex);
2951
2952 /* Not safe to do outside the semaphore. We must not return
2953 * until all unregister events invoked by the local processor
2954 * have been completed (either by this todo run, or one on
2955 * another cpu).
2956 */
2957 if (list_empty(&net_todo_list))
2958 goto out;
2959
2960 /* Snapshot list, allow later requests */
2961 spin_lock(&net_todo_list_lock);
2962 list_splice_init(&net_todo_list, &list);
2963 spin_unlock(&net_todo_list_lock);
2964
2965 while (!list_empty(&list)) {
2966 struct net_device *dev
2967 = list_entry(list.next, struct net_device, todo_list);
2968 list_del(&dev->todo_list);
2969
2970 switch(dev->reg_state) {
2971 case NETREG_REGISTERING:
2972 err = netdev_register_sysfs(dev);
2973 if (err)
2974 printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
2975 dev->name, err);
2976 dev->reg_state = NETREG_REGISTERED;
2977 break;
2978
2979 case NETREG_UNREGISTERING:
2980 netdev_unregister_sysfs(dev);
2981 dev->reg_state = NETREG_UNREGISTERED;
2982
2983 netdev_wait_allrefs(dev);
2984
2985 /* paranoia */
2986 BUG_ON(atomic_read(&dev->refcnt));
2987 BUG_TRAP(!dev->ip_ptr);
2988 BUG_TRAP(!dev->ip6_ptr);
2989 BUG_TRAP(!dev->dn_ptr);
2990
2991
2992			/* It must be the very last action;
2993			 * after this, 'dev' may point to freed memory.
2994 */
2995 if (dev->destructor)
2996 dev->destructor(dev);
2997 break;
2998
2999 default:
3000 printk(KERN_ERR "network todo '%s' but state %d\n",
3001 dev->name, dev->reg_state);
3002 break;
3003 }
3004 }
3005
3006out:
3007 up(&net_todo_run_mutex);
3008}
3009
3010/**
3011 * alloc_netdev - allocate network device
3012 * @sizeof_priv: size of private data to allocate space for
3013 * @name: device name format string
3014 * @setup: callback to initialize device
3015 *
3016 * Allocates a struct net_device with private data area for driver use
3017 * and performs basic initialization.
3018 */
3019struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3020 void (*setup)(struct net_device *))
3021{
3022 void *p;
3023 struct net_device *dev;
3024 int alloc_size;
3025
3026 /* ensure 32-byte alignment of both the device and private area */
3027 alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
3028 alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3029
3030 p = kmalloc(alloc_size, GFP_KERNEL);
3031 if (!p) {
3032		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
3033 return NULL;
3034 }
3035 memset(p, 0, alloc_size);
3036
3037 dev = (struct net_device *)
3038 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3039 dev->padded = (char *)dev - (char *)p;
3040
3041 if (sizeof_priv)
3042 dev->priv = netdev_priv(dev);
3043
3044 setup(dev);
3045 strcpy(dev->name, name);
3046 return dev;
3047}
3048EXPORT_SYMBOL(alloc_netdev);
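/*
 * Illustrative sketch: retrieving the private area that @sizeof_priv
 * reserved.  The structure layout and the setup callback are assumptions;
 * the private area is zeroed by the memset() above.
 */
struct example_card {
	unsigned long io_base;
	spinlock_t lock;
};

static void example_setup(struct net_device *dev)
{
	struct example_card *card = netdev_priv(dev);

	ether_setup(dev);		/* common Ethernet defaults */
	spin_lock_init(&card->lock);
}

/* usage: dev = alloc_netdev(sizeof(struct example_card), "eth%d", example_setup); */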
3049
3050/**
3051 * free_netdev - free network device
3052 * @dev: device
3053 *
3054 * This function does the last stage of destroying an allocated device
3055 * interface. The reference to the device object is released.
3056 * If this is the last reference then it will be freed.
3057 */
3058void free_netdev(struct net_device *dev)
3059{
3060#ifdef CONFIG_SYSFS
3061	/* Compatibility with error handling in drivers */
3062 if (dev->reg_state == NETREG_UNINITIALIZED) {
3063 kfree((char *)dev - dev->padded);
3064 return;
3065 }
3066
3067 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3068 dev->reg_state = NETREG_RELEASED;
3069
3070 /* will free via class release */
3071 class_device_put(&dev->class_dev);
3072#else
3073 kfree((char *)dev - dev->padded);
3074#endif
3075}
3076
3077/* Synchronize with packet receive processing. */
3078void synchronize_net(void)
3079{
3080 might_sleep();
3081 synchronize_kernel();
3082}
3083
3084/**
3085 * unregister_netdevice - remove device from the kernel
3086 * @dev: device
3087 *
3088 * This function shuts down a device interface and removes it
3089 * from the kernel tables. On success 0 is returned, on a failure
3090 * a negative errno code is returned.
3091 *
3092 * Callers must hold the rtnl semaphore. You may want
3093 * unregister_netdev() instead of this.
3094 */
3095
3096int unregister_netdevice(struct net_device *dev)
3097{
3098 struct net_device *d, **dp;
3099
3100 BUG_ON(dev_boot_phase);
3101 ASSERT_RTNL();
3102
3103	/* Some devices call this without ever registering, to unwind a failed initialization. */
3104 if (dev->reg_state == NETREG_UNINITIALIZED) {
3105 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3106 "was registered\n", dev->name, dev);
3107 return -ENODEV;
3108 }
3109
3110 BUG_ON(dev->reg_state != NETREG_REGISTERED);
3111
3112 /* If device is running, close it first. */
3113 if (dev->flags & IFF_UP)
3114 dev_close(dev);
3115
3116 /* And unlink it from device chain. */
3117 for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
3118 if (d == dev) {
3119 write_lock_bh(&dev_base_lock);
3120 hlist_del(&dev->name_hlist);
3121 hlist_del(&dev->index_hlist);
3122 if (dev_tail == &dev->next)
3123 dev_tail = dp;
3124 *dp = d->next;
3125 write_unlock_bh(&dev_base_lock);
3126 break;
3127 }
3128 }
3129 if (!d) {
3130 printk(KERN_ERR "unregister net_device: '%s' not found\n",
3131 dev->name);
3132 return -ENODEV;
3133 }
3134
3135 dev->reg_state = NETREG_UNREGISTERING;
3136
3137 synchronize_net();
3138
3139 /* Shutdown queueing discipline. */
3140 dev_shutdown(dev);
3141
3142
3143	/* Notify protocols that we are about to destroy this device.
3144	   They should clean up everything they have attached to it.
3145 */
3146 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3147
3148 /*
3149 * Flush the multicast chain
3150 */
3151 dev_mc_discard(dev);
3152
3153 if (dev->uninit)
3154 dev->uninit(dev);
3155
3156 /* Notifier chain MUST detach us from master device. */
3157 BUG_TRAP(!dev->master);
3158
3159 free_divert_blk(dev);
3160
3161 /* Finish processing unregister after unlock */
3162 net_set_todo(dev);
3163
3164 synchronize_net();
3165
3166 dev_put(dev);
3167 return 0;
3168}
3169
3170/**
3171 * unregister_netdev - remove device from the kernel
3172 * @dev: device
3173 *
3174 * This function shuts down a device interface and removes it
3175 * from the kernel tables. On success 0 is returned, on a failure
3176 * a negative errno code is returned.
3177 *
3178 * This is just a wrapper for unregister_netdevice that takes
3179 * the rtnl semaphore. In general you want to use this and not
3180 * unregister_netdevice.
3181 */
3182void unregister_netdev(struct net_device *dev)
3183{
3184 rtnl_lock();
3185 unregister_netdevice(dev);
3186 rtnl_unlock();
3187}
3188
3189EXPORT_SYMBOL(unregister_netdev);
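/*
 * Illustrative sketch: the matching teardown for a driver that used
 * register_netdev().  Because rtnl_unlock() runs netdev_run_todo(), the
 * unregister has fully completed by the time unregister_netdev() returns,
 * so free_netdev() is safe immediately afterwards.  The function name is
 * an assumption.
 */
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* close, unlink and wait for references */
	free_netdev(dev);		/* release the device memory */
}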
3190
3191#ifdef CONFIG_HOTPLUG_CPU
3192static int dev_cpu_callback(struct notifier_block *nfb,
3193 unsigned long action,
3194 void *ocpu)
3195{
3196 struct sk_buff **list_skb;
3197 struct net_device **list_net;
3198 struct sk_buff *skb;
3199 unsigned int cpu, oldcpu = (unsigned long)ocpu;
3200 struct softnet_data *sd, *oldsd;
3201
3202 if (action != CPU_DEAD)
3203 return NOTIFY_OK;
3204
3205 local_irq_disable();
3206 cpu = smp_processor_id();
3207 sd = &per_cpu(softnet_data, cpu);
3208 oldsd = &per_cpu(softnet_data, oldcpu);
3209
3210 /* Find end of our completion_queue. */
3211 list_skb = &sd->completion_queue;
3212 while (*list_skb)
3213 list_skb = &(*list_skb)->next;
3214 /* Append completion queue from offline CPU. */
3215 *list_skb = oldsd->completion_queue;
3216 oldsd->completion_queue = NULL;
3217
3218 /* Find end of our output_queue. */
3219 list_net = &sd->output_queue;
3220 while (*list_net)
3221 list_net = &(*list_net)->next_sched;
3222 /* Append output queue from offline CPU. */
3223 *list_net = oldsd->output_queue;
3224 oldsd->output_queue = NULL;
3225
3226 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3227 local_irq_enable();
3228
3229 /* Process offline CPU's input_pkt_queue */
3230 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
3231 netif_rx(skb);
3232
3233 return NOTIFY_OK;
3234}
3235#endif /* CONFIG_HOTPLUG_CPU */
3236
3237
3238/*
3239 * Initialize the DEV module. At boot time this walks the device list and
3240 * unhooks any devices that fail to initialise (normally hardware not
3241 * present) and leaves us with a valid list of present and active devices.
3242 *
3243 */
3244
3245/*
3246 * This is called single threaded during boot, so no need
3247 * to take the rtnl semaphore.
3248 */
3249static int __init net_dev_init(void)
3250{
3251 int i, rc = -ENOMEM;
3252
3253 BUG_ON(!dev_boot_phase);
3254
3255 net_random_init();
3256
3257 if (dev_proc_init())
3258 goto out;
3259
3260 if (netdev_sysfs_init())
3261 goto out;
3262
3263 INIT_LIST_HEAD(&ptype_all);
3264 for (i = 0; i < 16; i++)
3265 INIT_LIST_HEAD(&ptype_base[i]);
3266
3267 for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3268 INIT_HLIST_HEAD(&dev_name_head[i]);
3269
3270 for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3271 INIT_HLIST_HEAD(&dev_index_head[i]);
3272
3273 /*
3274 * Initialise the packet receive queues.
3275 */
3276
3277 for (i = 0; i < NR_CPUS; i++) {
3278 struct softnet_data *queue;
3279
3280 queue = &per_cpu(softnet_data, i);
3281 skb_queue_head_init(&queue->input_pkt_queue);
3282 queue->throttle = 0;
3283 queue->cng_level = 0;
3284 queue->avg_blog = 10; /* arbitrary non-zero */
3285 queue->completion_queue = NULL;
3286 INIT_LIST_HEAD(&queue->poll_list);
3287 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3288 queue->backlog_dev.weight = weight_p;
3289 queue->backlog_dev.poll = process_backlog;
3290 atomic_set(&queue->backlog_dev.refcnt, 1);
3291 }
3292
3293#ifdef OFFLINE_SAMPLE
3294 samp_timer.expires = jiffies + (10 * HZ);
3295 add_timer(&samp_timer);
3296#endif
3297
3298 dev_boot_phase = 0;
3299
3300 open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3301 open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3302
3303 hotcpu_notifier(dev_cpu_callback, 0);
3304 dst_init();
3305 dev_mcast_init();
3306 rc = 0;
3307out:
3308 return rc;
3309}
3310
3311subsys_initcall(net_dev_init);
3312
3313EXPORT_SYMBOL(__dev_get_by_index);
3314EXPORT_SYMBOL(__dev_get_by_name);
3315EXPORT_SYMBOL(__dev_remove_pack);
3316EXPORT_SYMBOL(__skb_linearize);
3317EXPORT_SYMBOL(dev_add_pack);
3318EXPORT_SYMBOL(dev_alloc_name);
3319EXPORT_SYMBOL(dev_close);
3320EXPORT_SYMBOL(dev_get_by_flags);
3321EXPORT_SYMBOL(dev_get_by_index);
3322EXPORT_SYMBOL(dev_get_by_name);
3323EXPORT_SYMBOL(dev_ioctl);
3324EXPORT_SYMBOL(dev_open);
3325EXPORT_SYMBOL(dev_queue_xmit);
3326EXPORT_SYMBOL(dev_remove_pack);
3327EXPORT_SYMBOL(dev_set_allmulti);
3328EXPORT_SYMBOL(dev_set_promiscuity);
3329EXPORT_SYMBOL(dev_change_flags);
3330EXPORT_SYMBOL(dev_set_mtu);
3331EXPORT_SYMBOL(dev_set_mac_address);
3332EXPORT_SYMBOL(free_netdev);
3333EXPORT_SYMBOL(netdev_boot_setup_check);
3334EXPORT_SYMBOL(netdev_set_master);
3335EXPORT_SYMBOL(netdev_state_change);
3336EXPORT_SYMBOL(netif_receive_skb);
3337EXPORT_SYMBOL(netif_rx);
3338EXPORT_SYMBOL(register_gifconf);
3339EXPORT_SYMBOL(register_netdevice);
3340EXPORT_SYMBOL(register_netdevice_notifier);
3341EXPORT_SYMBOL(skb_checksum_help);
3342EXPORT_SYMBOL(synchronize_net);
3343EXPORT_SYMBOL(unregister_netdevice);
3344EXPORT_SYMBOL(unregister_netdevice_notifier);
3345EXPORT_SYMBOL(net_enable_timestamp);
3346EXPORT_SYMBOL(net_disable_timestamp);
3347EXPORT_SYMBOL(dev_get_flags);
3348
3349#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
3350EXPORT_SYMBOL(br_handle_frame_hook);
3351EXPORT_SYMBOL(br_fdb_get_hook);
3352EXPORT_SYMBOL(br_fdb_put_hook);
3353#endif
3354
3355#ifdef CONFIG_KMOD
3356EXPORT_SYMBOL(dev_load);
3357#endif
3358
3359EXPORT_PER_CPU_SYMBOL(softnet_data);
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
new file mode 100644
index 000000000000..db098ff3cd6a
--- /dev/null
+++ b/net/core/dev_mcast.c
@@ -0,0 +1,299 @@
1/*
2 * Linux NET3: Multicast List maintenance.
3 *
4 * Authors:
5 * Tim Kordas <tjk@nostromo.eeap.cwru.edu>
6 * Richard Underwood <richard@wuzz.demon.co.uk>
7 *
8 * Stir fried together from the IP multicast and CAP patches above
9 * Alan Cox <Alan.Cox@linux.org>
10 *
11 * Fixes:
12 * Alan Cox : Update the device on a real delete
13 * rather than any time but...
14 * Alan Cox : IFF_ALLMULTI support.
15 * Alan Cox : New format set_multicast_list() calls.
16 * Gleb Natapov : Remove dev_mc_lock.
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24#include <linux/config.h>
25#include <linux/module.h>
26#include <asm/uaccess.h>
27#include <asm/system.h>
28#include <linux/bitops.h>
29#include <linux/types.h>
30#include <linux/kernel.h>
31#include <linux/sched.h>
32#include <linux/string.h>
33#include <linux/mm.h>
34#include <linux/socket.h>
35#include <linux/sockios.h>
36#include <linux/in.h>
37#include <linux/errno.h>
38#include <linux/interrupt.h>
39#include <linux/if_ether.h>
40#include <linux/inet.h>
41#include <linux/netdevice.h>
42#include <linux/etherdevice.h>
43#include <linux/proc_fs.h>
44#include <linux/seq_file.h>
45#include <linux/init.h>
46#include <net/ip.h>
47#include <net/route.h>
48#include <linux/skbuff.h>
49#include <net/sock.h>
50#include <net/arp.h>
51
52
53/*
54 * Device multicast list maintenance.
55 *
56 * This is used both by IP and by the user level maintenance functions.
57 * Unlike BSD we maintain a usage count on a given multicast address so
58 * that a casual user application can add/delete multicasts used by
59 * protocols without doing damage to the protocols when it deletes the
60 * entries. It also helps IP as it tracks overlapping maps.
61 *
62 * Device mc lists are changed from BH context, at least when IPv6
63 * is enabled, so they must be BH protected.
64 *
65 * We block accesses to device mc filters with dev->xmit_lock.
66 */
67
68/*
69 * Update the multicast list into the physical NIC controller.
70 */
71
72static void __dev_mc_upload(struct net_device *dev)
73{
74	/* Don't do anything until the interface is up
75	 * (dev_open will call this function, so the list will
76	 * stay sane).
77 */
78
79 if (!(dev->flags&IFF_UP))
80 return;
81
82 /*
83	 * Devices with no set_multicast_list method, or which have
84	 * been detached, are not updated.
85 */
86
87 if (dev->set_multicast_list == NULL ||
88 !netif_device_present(dev))
89 return;
90
91 dev->set_multicast_list(dev);
92}
93
94void dev_mc_upload(struct net_device *dev)
95{
96 spin_lock_bh(&dev->xmit_lock);
97 __dev_mc_upload(dev);
98 spin_unlock_bh(&dev->xmit_lock);
99}
100
101/*
102 * Delete a device level multicast
103 */
104
105int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
106{
107 int err = 0;
108 struct dev_mc_list *dmi, **dmip;
109
110 spin_lock_bh(&dev->xmit_lock);
111
112 for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
113 /*
114 * Find the entry we want to delete. The device could
115 * have variable length entries so check these too.
116 */
117 if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
118 alen == dmi->dmi_addrlen) {
119 if (glbl) {
120 int old_glbl = dmi->dmi_gusers;
121 dmi->dmi_gusers = 0;
122 if (old_glbl == 0)
123 break;
124 }
125 if (--dmi->dmi_users)
126 goto done;
127
128 /*
129 * Last user. So delete the entry.
130 */
131 *dmip = dmi->next;
132 dev->mc_count--;
133
134 kfree(dmi);
135
136 /*
137			 * We have altered the list, so the filter
138			 * loaded into the card is now wrong. Fix it.
139 */
140 __dev_mc_upload(dev);
141
142 spin_unlock_bh(&dev->xmit_lock);
143 return 0;
144 }
145 }
146 err = -ENOENT;
147done:
148 spin_unlock_bh(&dev->xmit_lock);
149 return err;
150}
151
152/*
153 * Add a device level multicast
154 */
155
156int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
157{
158 int err = 0;
159 struct dev_mc_list *dmi, *dmi1;
160
161 dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC);
162
163 spin_lock_bh(&dev->xmit_lock);
164 for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
165 if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
166 dmi->dmi_addrlen == alen) {
167 if (glbl) {
168 int old_glbl = dmi->dmi_gusers;
169 dmi->dmi_gusers = 1;
170 if (old_glbl)
171 goto done;
172 }
173 dmi->dmi_users++;
174 goto done;
175 }
176 }
177
178 if ((dmi = dmi1) == NULL) {
179 spin_unlock_bh(&dev->xmit_lock);
180 return -ENOMEM;
181 }
182 memcpy(dmi->dmi_addr, addr, alen);
183 dmi->dmi_addrlen = alen;
184 dmi->next = dev->mc_list;
185 dmi->dmi_users = 1;
186 dmi->dmi_gusers = glbl ? 1 : 0;
187 dev->mc_list = dmi;
188 dev->mc_count++;
189
190 __dev_mc_upload(dev);
191
192 spin_unlock_bh(&dev->xmit_lock);
193 return 0;
194
195done:
196 spin_unlock_bh(&dev->xmit_lock);
197 if (dmi1)
198 kfree(dmi1);
199 return err;
200}
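/*
 * Illustrative sketch: how a protocol joins and leaves a hardware
 * multicast group on a device.  The group address (the Ethernet mapping
 * of 224.0.0.1) is only an example, and the entries here are non-global
 * (glbl == 0), so they are usage-counted per caller.
 */
static const unsigned char example_mc_addr[ETH_ALEN] = {
	0x01, 0x00, 0x5e, 0x00, 0x00, 0x01
};

static int example_join(struct net_device *dev)
{
	return dev_mc_add(dev, (void *)example_mc_addr, ETH_ALEN, 0);
}

static void example_leave(struct net_device *dev)
{
	dev_mc_delete(dev, (void *)example_mc_addr, ETH_ALEN, 0);
}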
201
202/*
203 * Discard multicast list when a device is downed
204 */
205
206void dev_mc_discard(struct net_device *dev)
207{
208 spin_lock_bh(&dev->xmit_lock);
209
210 while (dev->mc_list != NULL) {
211 struct dev_mc_list *tmp = dev->mc_list;
212 dev->mc_list = tmp->next;
213 if (tmp->dmi_users > tmp->dmi_gusers)
214 printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users);
215 kfree(tmp);
216 }
217 dev->mc_count = 0;
218
219 spin_unlock_bh(&dev->xmit_lock);
220}
221
222#ifdef CONFIG_PROC_FS
223static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos)
224{
225 struct net_device *dev;
226 loff_t off = 0;
227
228 read_lock(&dev_base_lock);
229 for (dev = dev_base; dev; dev = dev->next) {
230 if (off++ == *pos)
231 return dev;
232 }
233 return NULL;
234}
235
236static void *dev_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
237{
238 struct net_device *dev = v;
239 ++*pos;
240 return dev->next;
241}
242
243static void dev_mc_seq_stop(struct seq_file *seq, void *v)
244{
245 read_unlock(&dev_base_lock);
246}
247
248
249static int dev_mc_seq_show(struct seq_file *seq, void *v)
250{
251 struct dev_mc_list *m;
252 struct net_device *dev = v;
253
254 spin_lock_bh(&dev->xmit_lock);
255 for (m = dev->mc_list; m; m = m->next) {
256 int i;
257
258 seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
259 dev->name, m->dmi_users, m->dmi_gusers);
260
261 for (i = 0; i < m->dmi_addrlen; i++)
262 seq_printf(seq, "%02x", m->dmi_addr[i]);
263
264 seq_putc(seq, '\n');
265 }
266 spin_unlock_bh(&dev->xmit_lock);
267 return 0;
268}
269
270static struct seq_operations dev_mc_seq_ops = {
271 .start = dev_mc_seq_start,
272 .next = dev_mc_seq_next,
273 .stop = dev_mc_seq_stop,
274 .show = dev_mc_seq_show,
275};
276
277static int dev_mc_seq_open(struct inode *inode, struct file *file)
278{
279 return seq_open(file, &dev_mc_seq_ops);
280}
281
282static struct file_operations dev_mc_seq_fops = {
283 .owner = THIS_MODULE,
284 .open = dev_mc_seq_open,
285 .read = seq_read,
286 .llseek = seq_lseek,
287 .release = seq_release,
288};
289
290#endif
291
292void __init dev_mcast_init(void)
293{
294 proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops);
295}
296
297EXPORT_SYMBOL(dev_mc_add);
298EXPORT_SYMBOL(dev_mc_delete);
299EXPORT_SYMBOL(dev_mc_upload);
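For reference, dev_mcast_init() registers /proc/net/dev_mcast, and dev_mc_seq_show() prints one line per entry: ifindex, device name, dmi_users, dmi_gusers, then the hardware address in hex. A minimal user-space sketch that reads that file follows; only the seq_printf() format shown above is assumed, nothing else is treated as ABI.

/* Sketch: parse /proc/net/dev_mcast, the file created by dev_mcast_init().
 * The fscanf() format mirrors dev_mc_seq_show(): ifindex, name, users,
 * gusers, then the address as a hex string. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/dev_mcast", "r");
	int ifindex, users, gusers;
	char name[32], addr[128];

	if (!f) {
		perror("/proc/net/dev_mcast");
		return 1;
	}
	while (fscanf(f, "%d %31s %d %d %127s",
		      &ifindex, name, &users, &gusers, addr) == 5)
		printf("%-15s ifindex=%d addr=%s users=%d global=%d\n",
		       name, ifindex, addr, users, gusers);
	fclose(f);
	return 0;
}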
diff --git a/net/core/dst.c b/net/core/dst.c
new file mode 100644
index 000000000000..3bf6cc434814
--- /dev/null
+++ b/net/core/dst.c
@@ -0,0 +1,276 @@
1/*
2 * net/core/dst.c Protocol independent destination cache.
3 *
4 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
5 *
6 */
7
8#include <linux/bitops.h>
9#include <linux/errno.h>
10#include <linux/init.h>
11#include <linux/kernel.h>
12#include <linux/mm.h>
13#include <linux/module.h>
14#include <linux/netdevice.h>
15#include <linux/sched.h>
16#include <linux/skbuff.h>
17#include <linux/string.h>
18#include <linux/types.h>
19
20#include <net/dst.h>
21
22/* Locking strategy:
23 * 1) Garbage collection state of dead destination cache
24 * entries is protected by dst_lock.
25 * 2) GC is run only from BH context, and is the only remover
26 * of entries.
27 * 3) Entries are added to the garbage list from both BH
28 * and non-BH context, so local BH disabling is needed.
29 * 4) All operations modify state, so a spinlock is used.
30 */
31static struct dst_entry *dst_garbage_list;
32#if RT_CACHE_DEBUG >= 2
33static atomic_t dst_total = ATOMIC_INIT(0);
34#endif
35static DEFINE_SPINLOCK(dst_lock);
36
37static unsigned long dst_gc_timer_expires;
38static unsigned long dst_gc_timer_inc = DST_GC_MAX;
39static void dst_run_gc(unsigned long);
40static void ___dst_free(struct dst_entry * dst);
41
42static struct timer_list dst_gc_timer =
43 TIMER_INITIALIZER(dst_run_gc, DST_GC_MIN, 0);
44
45static void dst_run_gc(unsigned long dummy)
46{
47 int delayed = 0;
48 struct dst_entry * dst, **dstp;
49
50 if (!spin_trylock(&dst_lock)) {
51 mod_timer(&dst_gc_timer, jiffies + HZ/10);
52 return;
53 }
54
55
56 del_timer(&dst_gc_timer);
57 dstp = &dst_garbage_list;
58 while ((dst = *dstp) != NULL) {
59 if (atomic_read(&dst->__refcnt)) {
60 dstp = &dst->next;
61 delayed++;
62 continue;
63 }
64 *dstp = dst->next;
65
66 dst = dst_destroy(dst);
67 if (dst) {
68 /* NOHASH and still referenced. Unless it is already
69 * on gc list, invalidate it and add to gc list.
70 *
71 * Note: this is temporary. Actually, NOHASH dst's
72 * must be obsoleted when parent is obsoleted.
73 * But we do not have state "obsoleted, but
74 * referenced by parent", so it is right.
75 */
76 if (dst->obsolete > 1)
77 continue;
78
79 ___dst_free(dst);
80 dst->next = *dstp;
81 *dstp = dst;
82 dstp = &dst->next;
83 }
84 }
85 if (!dst_garbage_list) {
86 dst_gc_timer_inc = DST_GC_MAX;
87 goto out;
88 }
89 if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
90 dst_gc_timer_expires = DST_GC_MAX;
91 dst_gc_timer_inc += DST_GC_INC;
92 dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
93#if RT_CACHE_DEBUG >= 2
94 printk("dst_total: %d/%d %ld\n",
95 atomic_read(&dst_total), delayed, dst_gc_timer_expires);
96#endif
97 add_timer(&dst_gc_timer);
98
99out:
100 spin_unlock(&dst_lock);
101}
102
103static int dst_discard_in(struct sk_buff *skb)
104{
105 kfree_skb(skb);
106 return 0;
107}
108
109static int dst_discard_out(struct sk_buff *skb)
110{
111 kfree_skb(skb);
112 return 0;
113}
114
115void * dst_alloc(struct dst_ops * ops)
116{
117 struct dst_entry * dst;
118
119 if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
120 if (ops->gc())
121 return NULL;
122 }
123 dst = kmem_cache_alloc(ops->kmem_cachep, SLAB_ATOMIC);
124 if (!dst)
125 return NULL;
126 memset(dst, 0, ops->entry_size);
127 atomic_set(&dst->__refcnt, 0);
128 dst->ops = ops;
129 dst->lastuse = jiffies;
130 dst->path = dst;
131 dst->input = dst_discard_in;
132 dst->output = dst_discard_out;
133#if RT_CACHE_DEBUG >= 2
134 atomic_inc(&dst_total);
135#endif
136 atomic_inc(&ops->entries);
137 return dst;
138}
139
140static void ___dst_free(struct dst_entry * dst)
141{
142	/* The first case (dev==NULL) is required when
143	   the protocol module is unloaded.
144 */
145 if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
146 dst->input = dst_discard_in;
147 dst->output = dst_discard_out;
148 }
149 dst->obsolete = 2;
150}
151
152void __dst_free(struct dst_entry * dst)
153{
154 spin_lock_bh(&dst_lock);
155 ___dst_free(dst);
156 dst->next = dst_garbage_list;
157 dst_garbage_list = dst;
158 if (dst_gc_timer_inc > DST_GC_INC) {
159 dst_gc_timer_inc = DST_GC_INC;
160 dst_gc_timer_expires = DST_GC_MIN;
161 mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
162 }
163 spin_unlock_bh(&dst_lock);
164}
165
166struct dst_entry *dst_destroy(struct dst_entry * dst)
167{
168 struct dst_entry *child;
169 struct neighbour *neigh;
170 struct hh_cache *hh;
171
172 smp_rmb();
173
174again:
175 neigh = dst->neighbour;
176 hh = dst->hh;
177 child = dst->child;
178
179 dst->hh = NULL;
180 if (hh && atomic_dec_and_test(&hh->hh_refcnt))
181 kfree(hh);
182
183 if (neigh) {
184 dst->neighbour = NULL;
185 neigh_release(neigh);
186 }
187
188 atomic_dec(&dst->ops->entries);
189
190 if (dst->ops->destroy)
191 dst->ops->destroy(dst);
192 if (dst->dev)
193 dev_put(dst->dev);
194#if RT_CACHE_DEBUG >= 2
195 atomic_dec(&dst_total);
196#endif
197 kmem_cache_free(dst->ops->kmem_cachep, dst);
198
199 dst = child;
200 if (dst) {
201 if (atomic_dec_and_test(&dst->__refcnt)) {
202 /* We were real parent of this dst, so kill child. */
203 if (dst->flags&DST_NOHASH)
204 goto again;
205 } else {
206 /* Child is still referenced, return it for freeing. */
207 if (dst->flags&DST_NOHASH)
208 return dst;
209 /* Child is still in his hash table */
210 }
211 }
212 return NULL;
213}
214
215/* Dirty hack. We did it in 2.2 (in __dst_free),
216 * we have _very_ good reasons not to repeat
217 * this mistake in 2.3, but we have no choice
218 * now. _It_ _is_ _explicit_ _deliberate_
219 * _race_ _condition_.
220 *
221 * Commented and originally written by Alexey.
222 */
223static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
224 int unregister)
225{
226 if (dst->ops->ifdown)
227 dst->ops->ifdown(dst, dev, unregister);
228
229 if (dev != dst->dev)
230 return;
231
232 if (!unregister) {
233 dst->input = dst_discard_in;
234 dst->output = dst_discard_out;
235 } else {
236 dst->dev = &loopback_dev;
237 dev_hold(&loopback_dev);
238 dev_put(dev);
239 if (dst->neighbour && dst->neighbour->dev == dev) {
240 dst->neighbour->dev = &loopback_dev;
241 dev_put(dev);
242 dev_hold(&loopback_dev);
243 }
244 }
245}
246
247static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
248{
249 struct net_device *dev = ptr;
250 struct dst_entry *dst;
251
252 switch (event) {
253 case NETDEV_UNREGISTER:
254 case NETDEV_DOWN:
255 spin_lock_bh(&dst_lock);
256 for (dst = dst_garbage_list; dst; dst = dst->next) {
257 dst_ifdown(dst, dev, event != NETDEV_DOWN);
258 }
259 spin_unlock_bh(&dst_lock);
260 break;
261 }
262 return NOTIFY_DONE;
263}
264
265static struct notifier_block dst_dev_notifier = {
266 .notifier_call = dst_dev_event,
267};
268
269void __init dst_init(void)
270{
271 register_netdevice_notifier(&dst_dev_notifier);
272}
273
274EXPORT_SYMBOL(__dst_free);
275EXPORT_SYMBOL(dst_alloc);
276EXPORT_SYMBOL(dst_destroy);
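The garbage-collection timer above backs off while dead entries stay referenced: each dst_run_gc() pass adds dst_gc_timer_inc to dst_gc_timer_expires (capped at DST_GC_MAX) and grows the increment by DST_GC_INC, while __dst_free() drops both back to DST_GC_INC/DST_GC_MIN whenever new garbage is queued after a back-off. A standalone sketch of that schedule follows; the real DST_GC_MIN/DST_GC_INC/DST_GC_MAX values live in net/dst.h and are not part of this file, so the numbers below are placeholders chosen only to make the shape visible.

/* Standalone sketch of the dst GC back-off. The DST_GC_* values are
 * placeholders (the real ones come from net/dst.h); units are jiffies. */
#include <stdio.h>

#define HZ		100
#define DST_GC_MIN	(HZ / 10)	/* placeholder */
#define DST_GC_INC	(HZ / 2)	/* placeholder */
#define DST_GC_MAX	(120 * HZ)	/* placeholder */

int main(void)
{
	/* State right after __dst_free() queued garbage and reset the timer. */
	unsigned long expires = DST_GC_MIN;
	unsigned long inc = DST_GC_INC;
	int pass;

	for (pass = 0; pass < 10; pass++) {
		printf("pass %2d: next dst_run_gc() in %lu jiffies\n",
		       pass, expires);
		/* Update rule from the tail of dst_run_gc() when the
		 * garbage list is still non-empty. */
		if ((expires += inc) > DST_GC_MAX)
			expires = DST_GC_MAX;
		inc += DST_GC_INC;
	}
	return 0;
}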
diff --git a/net/core/dv.c b/net/core/dv.c
new file mode 100644
index 000000000000..3f25f4aa4e66
--- /dev/null
+++ b/net/core/dv.c
@@ -0,0 +1,548 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic frame diversion
7 *
8 * Authors:
9 * Benoit LOCHER: initial integration within the kernel with support for ethernet
10 * Dave Miller: improvement on the code (correctness, performance and source files)
11 *
12 */
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/mm.h>
19#include <linux/socket.h>
20#include <linux/in.h>
21#include <linux/inet.h>
22#include <linux/ip.h>
23#include <linux/udp.h>
24#include <linux/netdevice.h>
25#include <linux/etherdevice.h>
26#include <linux/skbuff.h>
27#include <linux/errno.h>
28#include <linux/init.h>
29#include <net/dst.h>
30#include <net/arp.h>
31#include <net/sock.h>
32#include <net/ipv6.h>
33#include <net/ip.h>
34#include <asm/uaccess.h>
35#include <asm/system.h>
36#include <asm/checksum.h>
37#include <linux/divert.h>
38#include <linux/sockios.h>
39
40const char sysctl_divert_version[32]="0.46"; /* Current version */
41
42static int __init dv_init(void)
43{
44 return 0;
45}
46module_init(dv_init);
47
48/*
49 * Allocate a divert_blk for a device. This must be an ethernet nic.
50 */
51int alloc_divert_blk(struct net_device *dev)
52{
53 int alloc_size = (sizeof(struct divert_blk) + 3) & ~3;
54
55 dev->divert = NULL;
56 if (dev->type == ARPHRD_ETHER) {
57 dev->divert = (struct divert_blk *)
58 kmalloc(alloc_size, GFP_KERNEL);
59 if (dev->divert == NULL) {
60 printk(KERN_INFO "divert: unable to allocate divert_blk for %s\n",
61 dev->name);
62 return -ENOMEM;
63 }
64
65 memset(dev->divert, 0, sizeof(struct divert_blk));
66 dev_hold(dev);
67 }
68
69 return 0;
70}
71
72/*
73 * Free a divert_blk allocated by the above function, if it was
74 * allocated on that device.
75 */
76void free_divert_blk(struct net_device *dev)
77{
78 if (dev->divert) {
79 kfree(dev->divert);
80 dev->divert=NULL;
81 dev_put(dev);
82 }
83}
84
85/*
86 * Adds a tcp/udp (source or dest) port to an array
87 */
88static int add_port(u16 ports[], u16 port)
89{
90 int i;
91
92 if (port == 0)
93 return -EINVAL;
94
95 /* Storing directly in network format for performance,
96 * thanks Dave :)
97 */
98 port = htons(port);
99
100 for (i = 0; i < MAX_DIVERT_PORTS; i++) {
101 if (ports[i] == port)
102 return -EALREADY;
103 }
104
105 for (i = 0; i < MAX_DIVERT_PORTS; i++) {
106 if (ports[i] == 0) {
107 ports[i] = port;
108 return 0;
109 }
110 }
111
112 return -ENOBUFS;
113}
114
115/*
116 * Removes a port from an array tcp/udp (source or dest)
117 */
118static int remove_port(u16 ports[], u16 port)
119{
120 int i;
121
122 if (port == 0)
123 return -EINVAL;
124
125 /* Storing directly in network format for performance,
126 * thanks Dave !
127 */
128 port = htons(port);
129
130 for (i = 0; i < MAX_DIVERT_PORTS; i++) {
131 if (ports[i] == port) {
132 ports[i] = 0;
133 return 0;
134 }
135 }
136
137 return -EINVAL;
138}
139
140/* Some basic sanity checks on the arguments passed to divert_ioctl() */
141static int check_args(struct divert_cf *div_cf, struct net_device **dev)
142{
143 char devname[32];
144 int ret;
145
146 if (dev == NULL)
147 return -EFAULT;
148
149 /* GETVERSION: all other args are unused */
150 if (div_cf->cmd == DIVCMD_GETVERSION)
151 return 0;
152
153 /* Network device index should reasonably be between 0 and 1000 :) */
154 if (div_cf->dev_index < 0 || div_cf->dev_index > 1000)
155 return -EINVAL;
156
157 /* Let's try to find the ifname */
158 sprintf(devname, "eth%d", div_cf->dev_index);
159 *dev = dev_get_by_name(devname);
160
161 /* dev should NOT be null */
162 if (*dev == NULL)
163 return -EINVAL;
164
165 ret = 0;
166
167 /* user issuing the ioctl must be a super one :) */
168 if (!capable(CAP_SYS_ADMIN)) {
169 ret = -EPERM;
170 goto out;
171 }
172
173 /* Device must have a divert_blk member NOT null */
174 if ((*dev)->divert == NULL)
175 ret = -EINVAL;
176out:
177 dev_put(*dev);
178 return ret;
179}
180
181/*
182 * control function of the diverter
183 */
184#if 0
185#define DVDBG(a) \
186 printk(KERN_DEBUG "divert_ioctl() line %d %s\n", __LINE__, (a))
187#else
188#define DVDBG(a)
189#endif
190
191int divert_ioctl(unsigned int cmd, struct divert_cf __user *arg)
192{
193 struct divert_cf div_cf;
194 struct divert_blk *div_blk;
195 struct net_device *dev;
196 int ret;
197
198 switch (cmd) {
199 case SIOCGIFDIVERT:
200 DVDBG("SIOCGIFDIVERT, copy_from_user");
201 if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf)))
202 return -EFAULT;
203 DVDBG("before check_args");
204 ret = check_args(&div_cf, &dev);
205 if (ret)
206 return ret;
207 DVDBG("after checkargs");
208 div_blk = dev->divert;
209
210			DVDBG("before switch()");
211 switch (div_cf.cmd) {
212 case DIVCMD_GETSTATUS:
213 /* Now, just give the user the raw divert block
214 * for him to play with :)
215 */
216 if (copy_to_user(div_cf.arg1.ptr, dev->divert,
217 sizeof(struct divert_blk)))
218 return -EFAULT;
219 break;
220
221 case DIVCMD_GETVERSION:
222 DVDBG("GETVERSION: checking ptr");
223 if (div_cf.arg1.ptr == NULL)
224 return -EINVAL;
225 DVDBG("GETVERSION: copying data to userland");
226 if (copy_to_user(div_cf.arg1.ptr,
227 sysctl_divert_version, 32))
228 return -EFAULT;
229 DVDBG("GETVERSION: data copied");
230 break;
231
232 default:
233 return -EINVAL;
234 }
235
236 break;
237
238 case SIOCSIFDIVERT:
239 if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf)))
240 return -EFAULT;
241
242 ret = check_args(&div_cf, &dev);
243 if (ret)
244 return ret;
245
246 div_blk = dev->divert;
247
248 switch(div_cf.cmd) {
249 case DIVCMD_RESET:
250 div_blk->divert = 0;
251 div_blk->protos = DIVERT_PROTO_NONE;
252 memset(div_blk->tcp_dst, 0,
253 MAX_DIVERT_PORTS * sizeof(u16));
254 memset(div_blk->tcp_src, 0,
255 MAX_DIVERT_PORTS * sizeof(u16));
256 memset(div_blk->udp_dst, 0,
257 MAX_DIVERT_PORTS * sizeof(u16));
258 memset(div_blk->udp_src, 0,
259 MAX_DIVERT_PORTS * sizeof(u16));
260 return 0;
261
262 case DIVCMD_DIVERT:
263 switch(div_cf.arg1.int32) {
264 case DIVARG1_ENABLE:
265 if (div_blk->divert)
266 return -EALREADY;
267 div_blk->divert = 1;
268 break;
269
270 case DIVARG1_DISABLE:
271 if (!div_blk->divert)
272 return -EALREADY;
273 div_blk->divert = 0;
274 break;
275
276 default:
277 return -EINVAL;
278 }
279
280 break;
281
282 case DIVCMD_IP:
283 switch(div_cf.arg1.int32) {
284 case DIVARG1_ENABLE:
285 if (div_blk->protos & DIVERT_PROTO_IP)
286 return -EALREADY;
287 div_blk->protos |= DIVERT_PROTO_IP;
288 break;
289
290 case DIVARG1_DISABLE:
291 if (!(div_blk->protos & DIVERT_PROTO_IP))
292 return -EALREADY;
293 div_blk->protos &= ~DIVERT_PROTO_IP;
294 break;
295
296 default:
297 return -EINVAL;
298 }
299
300 break;
301
302 case DIVCMD_TCP:
303 switch(div_cf.arg1.int32) {
304 case DIVARG1_ENABLE:
305 if (div_blk->protos & DIVERT_PROTO_TCP)
306 return -EALREADY;
307 div_blk->protos |= DIVERT_PROTO_TCP;
308 break;
309
310 case DIVARG1_DISABLE:
311 if (!(div_blk->protos & DIVERT_PROTO_TCP))
312 return -EALREADY;
313 div_blk->protos &= ~DIVERT_PROTO_TCP;
314 break;
315
316 default:
317 return -EINVAL;
318 }
319
320 break;
321
322 case DIVCMD_TCPDST:
323 switch(div_cf.arg1.int32) {
324 case DIVARG1_ADD:
325 return add_port(div_blk->tcp_dst,
326 div_cf.arg2.uint16);
327
328 case DIVARG1_REMOVE:
329 return remove_port(div_blk->tcp_dst,
330 div_cf.arg2.uint16);
331
332 default:
333 return -EINVAL;
334 }
335
336 break;
337
338 case DIVCMD_TCPSRC:
339 switch(div_cf.arg1.int32) {
340 case DIVARG1_ADD:
341 return add_port(div_blk->tcp_src,
342 div_cf.arg2.uint16);
343
344 case DIVARG1_REMOVE:
345 return remove_port(div_blk->tcp_src,
346 div_cf.arg2.uint16);
347
348 default:
349 return -EINVAL;
350 }
351
352 break;
353
354 case DIVCMD_UDP:
355 switch(div_cf.arg1.int32) {
356 case DIVARG1_ENABLE:
357 if (div_blk->protos & DIVERT_PROTO_UDP)
358 return -EALREADY;
359 div_blk->protos |= DIVERT_PROTO_UDP;
360 break;
361
362 case DIVARG1_DISABLE:
363 if (!(div_blk->protos & DIVERT_PROTO_UDP))
364 return -EALREADY;
365 div_blk->protos &= ~DIVERT_PROTO_UDP;
366 break;
367
368 default:
369 return -EINVAL;
370 }
371
372 break;
373
374 case DIVCMD_UDPDST:
375 switch(div_cf.arg1.int32) {
376 case DIVARG1_ADD:
377 return add_port(div_blk->udp_dst,
378 div_cf.arg2.uint16);
379
380 case DIVARG1_REMOVE:
381 return remove_port(div_blk->udp_dst,
382 div_cf.arg2.uint16);
383
384 default:
385 return -EINVAL;
386 }
387
388 break;
389
390 case DIVCMD_UDPSRC:
391 switch(div_cf.arg1.int32) {
392 case DIVARG1_ADD:
393 return add_port(div_blk->udp_src,
394 div_cf.arg2.uint16);
395
396 case DIVARG1_REMOVE:
397 return remove_port(div_blk->udp_src,
398 div_cf.arg2.uint16);
399
400 default:
401 return -EINVAL;
402 }
403
404 break;
405
406 case DIVCMD_ICMP:
407 switch(div_cf.arg1.int32) {
408 case DIVARG1_ENABLE:
409 if (div_blk->protos & DIVERT_PROTO_ICMP)
410 return -EALREADY;
411 div_blk->protos |= DIVERT_PROTO_ICMP;
412 break;
413
414 case DIVARG1_DISABLE:
415 if (!(div_blk->protos & DIVERT_PROTO_ICMP))
416 return -EALREADY;
417 div_blk->protos &= ~DIVERT_PROTO_ICMP;
418 break;
419
420 default:
421 return -EINVAL;
422 }
423
424 break;
425
426 default:
427 return -EINVAL;
428 }
429
430 break;
431
432 default:
433 return -EINVAL;
434 }
435
436 return 0;
437}
438
439
440/*
441 * Check if packet should have its dest mac address set to the box itself
442 * for diversion
443 */
444
445#define ETH_DIVERT_FRAME(skb) \
446 memcpy(eth_hdr(skb), skb->dev->dev_addr, ETH_ALEN); \
447 skb->pkt_type=PACKET_HOST
448
449void divert_frame(struct sk_buff *skb)
450{
451 struct ethhdr *eth = eth_hdr(skb);
452 struct iphdr *iph;
453 struct tcphdr *tcph;
454 struct udphdr *udph;
455 struct divert_blk *divert = skb->dev->divert;
456 int i, src, dst;
457 unsigned char *skb_data_end = skb->data + skb->len;
458
459 /* Packet is already aimed at us, return */
460 if (!memcmp(eth, skb->dev->dev_addr, ETH_ALEN))
461 return;
462
463 /* proto is not IP, do nothing */
464 if (eth->h_proto != htons(ETH_P_IP))
465 return;
466
467 /* Divert all IP frames ? */
468 if (divert->protos & DIVERT_PROTO_IP) {
469 ETH_DIVERT_FRAME(skb);
470 return;
471 }
472
473	/* Check for a possibly (maliciously) malformed IP frame (thanks Dave) */
474 iph = (struct iphdr *) skb->data;
475 if (((iph->ihl<<2)+(unsigned char*)(iph)) >= skb_data_end) {
476 printk(KERN_INFO "divert: malformed IP packet !\n");
477 return;
478 }
479
480 switch (iph->protocol) {
481 /* Divert all ICMP frames ? */
482 case IPPROTO_ICMP:
483 if (divert->protos & DIVERT_PROTO_ICMP) {
484 ETH_DIVERT_FRAME(skb);
485 return;
486 }
487 break;
488
489 /* Divert all TCP frames ? */
490 case IPPROTO_TCP:
491 if (divert->protos & DIVERT_PROTO_TCP) {
492 ETH_DIVERT_FRAME(skb);
493 return;
494 }
495
496		/* Check for a possibly (maliciously) malformed IP
497 * frame (thanx Dave)
498 */
499 tcph = (struct tcphdr *)
500 (((unsigned char *)iph) + (iph->ihl<<2));
501 if (((unsigned char *)(tcph+1)) >= skb_data_end) {
502 printk(KERN_INFO "divert: malformed TCP packet !\n");
503 return;
504 }
505
506 /* Divert some tcp dst/src ports only ?*/
507 for (i = 0; i < MAX_DIVERT_PORTS; i++) {
508 dst = divert->tcp_dst[i];
509 src = divert->tcp_src[i];
510 if ((dst && dst == tcph->dest) ||
511 (src && src == tcph->source)) {
512 ETH_DIVERT_FRAME(skb);
513 return;
514 }
515 }
516 break;
517
518 /* Divert all UDP frames ? */
519 case IPPROTO_UDP:
520 if (divert->protos & DIVERT_PROTO_UDP) {
521 ETH_DIVERT_FRAME(skb);
522 return;
523 }
524
525		/* Check for a possibly (maliciously) malformed IP
526 * packet (thanks Dave)
527 */
528 udph = (struct udphdr *)
529 (((unsigned char *)iph) + (iph->ihl<<2));
530 if (((unsigned char *)(udph+1)) >= skb_data_end) {
531 printk(KERN_INFO
532 "divert: malformed UDP packet !\n");
533 return;
534 }
535
536 /* Divert some udp dst/src ports only ? */
537 for (i = 0; i < MAX_DIVERT_PORTS; i++) {
538 dst = divert->udp_dst[i];
539 src = divert->udp_src[i];
540 if ((dst && dst == udph->dest) ||
541 (src && src == udph->source)) {
542 ETH_DIVERT_FRAME(skb);
543 return;
544 }
545 }
546 break;
547 }
548}
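divert_ioctl() above is reached through the SIOCGIFDIVERT/SIOCSIFDIVERT ioctls; check_args() resolves the target device as eth<dev_index> and insists on CAP_SYS_ADMIN. A hedged user-space sketch follows: it assumes <linux/divert.h> exposes struct divert_cf with the fields this handler reads (cmd, dev_index, arg1, arg2) and that the ioctl argument is a user pointer to that structure, as the copy_from_user() above implies; the actual dispatch to divert_ioctl() lives in net/core/dev.c and is not shown here.

/* Sketch: enable frame diversion on eth0 and divert all TCP frames,
 * via the SIOCSIFDIVERT path handled by divert_ioctl() above. Assumes
 * <linux/divert.h> provides struct divert_cf plus the DIVCMD_ and
 * DIVARG1_ constants; must run with CAP_SYS_ADMIN (check_args()). */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/sockios.h>
#include <linux/divert.h>

static int divert_cmd(int fd, int dev_index, int cmd, int arg1)
{
	struct divert_cf cf;

	memset(&cf, 0, sizeof(cf));
	cf.cmd = cmd;
	cf.dev_index = dev_index;	/* check_args() looks up "eth<index>" */
	cf.arg1.int32 = arg1;
	return ioctl(fd, SIOCSIFDIVERT, &cf);
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0 ||
	    divert_cmd(fd, 0, DIVCMD_DIVERT, DIVARG1_ENABLE) < 0 ||
	    divert_cmd(fd, 0, DIVCMD_TCP, DIVARG1_ENABLE) < 0) {
		perror("divert");
		return 1;
	}
	puts("diversion of TCP frames enabled on eth0");
	return 0;
}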
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
new file mode 100644
index 000000000000..f05fde97c43d
--- /dev/null
+++ b/net/core/ethtool.c
@@ -0,0 +1,819 @@
1/*
2 * net/core/ethtool.c - Ethtool ioctl handler
3 * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx>
4 *
5 * This file is where we call all the ethtool_ops commands to get
6 * the information ethtool needs. We fall back to calling do_ioctl()
7 * for drivers which haven't been converted to ethtool_ops yet.
8 *
9 * It's GPL, stupid.
10 */
11
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/errno.h>
15#include <linux/ethtool.h>
16#include <linux/netdevice.h>
17#include <asm/uaccess.h>
18
19/*
20 * Some useful ethtool_ops methods that are device-independent.
21 * If we find that all drivers want to do the same thing here,
22 * we can turn these into dev_() function calls.
23 */
24
25u32 ethtool_op_get_link(struct net_device *dev)
26{
27 return netif_carrier_ok(dev) ? 1 : 0;
28}
29
30u32 ethtool_op_get_tx_csum(struct net_device *dev)
31{
32 return (dev->features & NETIF_F_IP_CSUM) != 0;
33}
34
35int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
36{
37 if (data)
38 dev->features |= NETIF_F_IP_CSUM;
39 else
40 dev->features &= ~NETIF_F_IP_CSUM;
41
42 return 0;
43}
44
45u32 ethtool_op_get_sg(struct net_device *dev)
46{
47 return (dev->features & NETIF_F_SG) != 0;
48}
49
50int ethtool_op_set_sg(struct net_device *dev, u32 data)
51{
52 if (data)
53 dev->features |= NETIF_F_SG;
54 else
55 dev->features &= ~NETIF_F_SG;
56
57 return 0;
58}
59
60u32 ethtool_op_get_tso(struct net_device *dev)
61{
62 return (dev->features & NETIF_F_TSO) != 0;
63}
64
65int ethtool_op_set_tso(struct net_device *dev, u32 data)
66{
67 if (data)
68 dev->features |= NETIF_F_TSO;
69 else
70 dev->features &= ~NETIF_F_TSO;
71
72 return 0;
73}
74
75/* Handlers for each ethtool command */
76
77static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
78{
79 struct ethtool_cmd cmd = { ETHTOOL_GSET };
80 int err;
81
82 if (!dev->ethtool_ops->get_settings)
83 return -EOPNOTSUPP;
84
85 err = dev->ethtool_ops->get_settings(dev, &cmd);
86 if (err < 0)
87 return err;
88
89 if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
90 return -EFAULT;
91 return 0;
92}
93
94static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
95{
96 struct ethtool_cmd cmd;
97
98 if (!dev->ethtool_ops->set_settings)
99 return -EOPNOTSUPP;
100
101 if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
102 return -EFAULT;
103
104 return dev->ethtool_ops->set_settings(dev, &cmd);
105}
106
107static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
108{
109 struct ethtool_drvinfo info;
110 struct ethtool_ops *ops = dev->ethtool_ops;
111
112 if (!ops->get_drvinfo)
113 return -EOPNOTSUPP;
114
115 memset(&info, 0, sizeof(info));
116 info.cmd = ETHTOOL_GDRVINFO;
117 ops->get_drvinfo(dev, &info);
118
119 if (ops->self_test_count)
120 info.testinfo_len = ops->self_test_count(dev);
121 if (ops->get_stats_count)
122 info.n_stats = ops->get_stats_count(dev);
123 if (ops->get_regs_len)
124 info.regdump_len = ops->get_regs_len(dev);
125 if (ops->get_eeprom_len)
126 info.eedump_len = ops->get_eeprom_len(dev);
127
128 if (copy_to_user(useraddr, &info, sizeof(info)))
129 return -EFAULT;
130 return 0;
131}
132
133static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
134{
135 struct ethtool_regs regs;
136 struct ethtool_ops *ops = dev->ethtool_ops;
137 void *regbuf;
138 int reglen, ret;
139
140 if (!ops->get_regs || !ops->get_regs_len)
141 return -EOPNOTSUPP;
142
143 if (copy_from_user(&regs, useraddr, sizeof(regs)))
144 return -EFAULT;
145
146 reglen = ops->get_regs_len(dev);
147 if (regs.len > reglen)
148 regs.len = reglen;
149
150 regbuf = kmalloc(reglen, GFP_USER);
151 if (!regbuf)
152 return -ENOMEM;
153
154 ops->get_regs(dev, &regs, regbuf);
155
156 ret = -EFAULT;
157 if (copy_to_user(useraddr, &regs, sizeof(regs)))
158 goto out;
159 useraddr += offsetof(struct ethtool_regs, data);
160 if (copy_to_user(useraddr, regbuf, regs.len))
161 goto out;
162 ret = 0;
163
164 out:
165 kfree(regbuf);
166 return ret;
167}
168
169static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
170{
171 struct ethtool_wolinfo wol = { ETHTOOL_GWOL };
172
173 if (!dev->ethtool_ops->get_wol)
174 return -EOPNOTSUPP;
175
176 dev->ethtool_ops->get_wol(dev, &wol);
177
178 if (copy_to_user(useraddr, &wol, sizeof(wol)))
179 return -EFAULT;
180 return 0;
181}
182
183static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
184{
185 struct ethtool_wolinfo wol;
186
187 if (!dev->ethtool_ops->set_wol)
188 return -EOPNOTSUPP;
189
190 if (copy_from_user(&wol, useraddr, sizeof(wol)))
191 return -EFAULT;
192
193 return dev->ethtool_ops->set_wol(dev, &wol);
194}
195
196static int ethtool_get_msglevel(struct net_device *dev, char __user *useraddr)
197{
198 struct ethtool_value edata = { ETHTOOL_GMSGLVL };
199
200 if (!dev->ethtool_ops->get_msglevel)
201 return -EOPNOTSUPP;
202
203 edata.data = dev->ethtool_ops->get_msglevel(dev);
204
205 if (copy_to_user(useraddr, &edata, sizeof(edata)))
206 return -EFAULT;
207 return 0;
208}
209
210static int ethtool_set_msglevel(struct net_device *dev, char __user *useraddr)
211{
212 struct ethtool_value edata;
213
214 if (!dev->ethtool_ops->set_msglevel)
215 return -EOPNOTSUPP;
216
217 if (copy_from_user(&edata, useraddr, sizeof(edata)))
218 return -EFAULT;
219
220 dev->ethtool_ops->set_msglevel(dev, edata.data);
221 return 0;
222}
223
224static int ethtool_nway_reset(struct net_device *dev)
225{
226 if (!dev->ethtool_ops->nway_reset)
227 return -EOPNOTSUPP;
228
229 return dev->ethtool_ops->nway_reset(dev);
230}
231
232static int ethtool_get_link(struct net_device *dev, void __user *useraddr)
233{
234 struct ethtool_value edata = { ETHTOOL_GLINK };
235
236 if (!dev->ethtool_ops->get_link)
237 return -EOPNOTSUPP;
238
239 edata.data = dev->ethtool_ops->get_link(dev);
240
241 if (copy_to_user(useraddr, &edata, sizeof(edata)))
242 return -EFAULT;
243 return 0;
244}
245
246static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
247{
248 struct ethtool_eeprom eeprom;
249 struct ethtool_ops *ops = dev->ethtool_ops;
250 u8 *data;
251 int ret;
252
253 if (!ops->get_eeprom || !ops->get_eeprom_len)
254 return -EOPNOTSUPP;
255
256 if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
257 return -EFAULT;
258
259 /* Check for wrap and zero */
260 if (eeprom.offset + eeprom.len <= eeprom.offset)
261 return -EINVAL;
262
263 /* Check for exceeding total eeprom len */
264 if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
265 return -EINVAL;
266
267 data = kmalloc(eeprom.len, GFP_USER);
268 if (!data)
269 return -ENOMEM;
270
271 ret = -EFAULT;
272 if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len))
273 goto out;
274
275 ret = ops->get_eeprom(dev, &eeprom, data);
276 if (ret)
277 goto out;
278
279 ret = -EFAULT;
280 if (copy_to_user(useraddr, &eeprom, sizeof(eeprom)))
281 goto out;
282 if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len))
283 goto out;
284 ret = 0;
285
286 out:
287 kfree(data);
288 return ret;
289}
290
291static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
292{
293 struct ethtool_eeprom eeprom;
294 struct ethtool_ops *ops = dev->ethtool_ops;
295 u8 *data;
296 int ret;
297
298 if (!ops->set_eeprom || !ops->get_eeprom_len)
299 return -EOPNOTSUPP;
300
301 if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
302 return -EFAULT;
303
304 /* Check for wrap and zero */
305 if (eeprom.offset + eeprom.len <= eeprom.offset)
306 return -EINVAL;
307
308 /* Check for exceeding total eeprom len */
309 if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
310 return -EINVAL;
311
312 data = kmalloc(eeprom.len, GFP_USER);
313 if (!data)
314 return -ENOMEM;
315
316 ret = -EFAULT;
317 if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len))
318 goto out;
319
320 ret = ops->set_eeprom(dev, &eeprom, data);
321 if (ret)
322 goto out;
323
324 if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len))
325 ret = -EFAULT;
326
327 out:
328 kfree(data);
329 return ret;
330}
331
332static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr)
333{
334 struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE };
335
336 if (!dev->ethtool_ops->get_coalesce)
337 return -EOPNOTSUPP;
338
339 dev->ethtool_ops->get_coalesce(dev, &coalesce);
340
341 if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
342 return -EFAULT;
343 return 0;
344}
345
346static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
347{
348 struct ethtool_coalesce coalesce;
349
350 if (!dev->ethtool_ops->get_coalesce)
351 return -EOPNOTSUPP;
352
353 if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
354 return -EFAULT;
355
356 return dev->ethtool_ops->set_coalesce(dev, &coalesce);
357}
358
359static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
360{
361 struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM };
362
363 if (!dev->ethtool_ops->get_ringparam)
364 return -EOPNOTSUPP;
365
366 dev->ethtool_ops->get_ringparam(dev, &ringparam);
367
368 if (copy_to_user(useraddr, &ringparam, sizeof(ringparam)))
369 return -EFAULT;
370 return 0;
371}
372
373static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
374{
375 struct ethtool_ringparam ringparam;
376
377 if (!dev->ethtool_ops->set_ringparam)
378 return -EOPNOTSUPP;
379
380 if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
381 return -EFAULT;
382
383 return dev->ethtool_ops->set_ringparam(dev, &ringparam);
384}
385
386static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
387{
388 struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
389
390 if (!dev->ethtool_ops->get_pauseparam)
391 return -EOPNOTSUPP;
392
393 dev->ethtool_ops->get_pauseparam(dev, &pauseparam);
394
395 if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam)))
396 return -EFAULT;
397 return 0;
398}
399
400static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
401{
402 struct ethtool_pauseparam pauseparam;
403
404 if (!dev->ethtool_ops->get_pauseparam)
405 return -EOPNOTSUPP;
406
407 if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
408 return -EFAULT;
409
410 return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
411}
412
413static int ethtool_get_rx_csum(struct net_device *dev, char __user *useraddr)
414{
415 struct ethtool_value edata = { ETHTOOL_GRXCSUM };
416
417 if (!dev->ethtool_ops->get_rx_csum)
418 return -EOPNOTSUPP;
419
420 edata.data = dev->ethtool_ops->get_rx_csum(dev);
421
422 if (copy_to_user(useraddr, &edata, sizeof(edata)))
423 return -EFAULT;
424 return 0;
425}
426
427static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr)
428{
429 struct ethtool_value edata;
430
431 if (!dev->ethtool_ops->set_rx_csum)
432 return -EOPNOTSUPP;
433
434 if (copy_from_user(&edata, useraddr, sizeof(edata)))
435 return -EFAULT;
436
437 dev->ethtool_ops->set_rx_csum(dev, edata.data);
438 return 0;
439}
440
441static int ethtool_get_tx_csum(struct net_device *dev, char __user *useraddr)
442{
443 struct ethtool_value edata = { ETHTOOL_GTXCSUM };
444
445 if (!dev->ethtool_ops->get_tx_csum)
446 return -EOPNOTSUPP;
447
448 edata.data = dev->ethtool_ops->get_tx_csum(dev);
449
450 if (copy_to_user(useraddr, &edata, sizeof(edata)))
451 return -EFAULT;
452 return 0;
453}
454
455static int __ethtool_set_sg(struct net_device *dev, u32 data)
456{
457 int err;
458
459 if (!data && dev->ethtool_ops->set_tso) {
460 err = dev->ethtool_ops->set_tso(dev, 0);
461 if (err)
462 return err;
463 }
464
465 return dev->ethtool_ops->set_sg(dev, data);
466}
467
468static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr)
469{
470 struct ethtool_value edata;
471 int err;
472
473 if (!dev->ethtool_ops->set_tx_csum)
474 return -EOPNOTSUPP;
475
476 if (copy_from_user(&edata, useraddr, sizeof(edata)))
477 return -EFAULT;
478
479 if (!edata.data && dev->ethtool_ops->set_sg) {
480 err = __ethtool_set_sg(dev, 0);
481 if (err)
482 return err;
483 }
484
485 return dev->ethtool_ops->set_tx_csum(dev, edata.data);
486}
487
488static int ethtool_get_sg(struct net_device *dev, char __user *useraddr)
489{
490 struct ethtool_value edata = { ETHTOOL_GSG };
491
492 if (!dev->ethtool_ops->get_sg)
493 return -EOPNOTSUPP;
494
495 edata.data = dev->ethtool_ops->get_sg(dev);
496
497 if (copy_to_user(useraddr, &edata, sizeof(edata)))
498 return -EFAULT;
499 return 0;
500}
501
502static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
503{
504 struct ethtool_value edata;
505
506 if (!dev->ethtool_ops->set_sg)
507 return -EOPNOTSUPP;
508
509 if (copy_from_user(&edata, useraddr, sizeof(edata)))
510 return -EFAULT;
511
512 if (edata.data &&
513 !(dev->features & (NETIF_F_IP_CSUM |
514 NETIF_F_NO_CSUM |
515 NETIF_F_HW_CSUM)))
516 return -EINVAL;
517
518 return __ethtool_set_sg(dev, edata.data);
519}
520
521static int ethtool_get_tso(struct net_device *dev, char __user *useraddr)
522{
523 struct ethtool_value edata = { ETHTOOL_GTSO };
524
525 if (!dev->ethtool_ops->get_tso)
526 return -EOPNOTSUPP;
527
528 edata.data = dev->ethtool_ops->get_tso(dev);
529
530 if (copy_to_user(useraddr, &edata, sizeof(edata)))
531 return -EFAULT;
532 return 0;
533}
534
535static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
536{
537 struct ethtool_value edata;
538
539 if (!dev->ethtool_ops->set_tso)
540 return -EOPNOTSUPP;
541
542 if (copy_from_user(&edata, useraddr, sizeof(edata)))
543 return -EFAULT;
544
545 if (edata.data && !(dev->features & NETIF_F_SG))
546 return -EINVAL;
547
548 return dev->ethtool_ops->set_tso(dev, edata.data);
549}
550
551static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
552{
553 struct ethtool_test test;
554 struct ethtool_ops *ops = dev->ethtool_ops;
555 u64 *data;
556 int ret;
557
558 if (!ops->self_test || !ops->self_test_count)
559 return -EOPNOTSUPP;
560
561 if (copy_from_user(&test, useraddr, sizeof(test)))
562 return -EFAULT;
563
564 test.len = ops->self_test_count(dev);
565 data = kmalloc(test.len * sizeof(u64), GFP_USER);
566 if (!data)
567 return -ENOMEM;
568
569 ops->self_test(dev, &test, data);
570
571 ret = -EFAULT;
572 if (copy_to_user(useraddr, &test, sizeof(test)))
573 goto out;
574 useraddr += sizeof(test);
575 if (copy_to_user(useraddr, data, test.len * sizeof(u64)))
576 goto out;
577 ret = 0;
578
579 out:
580 kfree(data);
581 return ret;
582}
583
584static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
585{
586 struct ethtool_gstrings gstrings;
587 struct ethtool_ops *ops = dev->ethtool_ops;
588 u8 *data;
589 int ret;
590
591 if (!ops->get_strings)
592 return -EOPNOTSUPP;
593
594 if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
595 return -EFAULT;
596
597 switch (gstrings.string_set) {
598 case ETH_SS_TEST:
599 if (!ops->self_test_count)
600 return -EOPNOTSUPP;
601 gstrings.len = ops->self_test_count(dev);
602 break;
603 case ETH_SS_STATS:
604 if (!ops->get_stats_count)
605 return -EOPNOTSUPP;
606 gstrings.len = ops->get_stats_count(dev);
607 break;
608 default:
609 return -EINVAL;
610 }
611
612 data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
613 if (!data)
614 return -ENOMEM;
615
616 ops->get_strings(dev, gstrings.string_set, data);
617
618 ret = -EFAULT;
619 if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
620 goto out;
621 useraddr += sizeof(gstrings);
622 if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
623 goto out;
624 ret = 0;
625
626 out:
627 kfree(data);
628 return ret;
629}
630
631static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
632{
633 struct ethtool_value id;
634
635 if (!dev->ethtool_ops->phys_id)
636 return -EOPNOTSUPP;
637
638 if (copy_from_user(&id, useraddr, sizeof(id)))
639 return -EFAULT;
640
641 return dev->ethtool_ops->phys_id(dev, id.data);
642}
643
644static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
645{
646 struct ethtool_stats stats;
647 struct ethtool_ops *ops = dev->ethtool_ops;
648 u64 *data;
649 int ret;
650
651 if (!ops->get_ethtool_stats || !ops->get_stats_count)
652 return -EOPNOTSUPP;
653
654 if (copy_from_user(&stats, useraddr, sizeof(stats)))
655 return -EFAULT;
656
657 stats.n_stats = ops->get_stats_count(dev);
658 data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER);
659 if (!data)
660 return -ENOMEM;
661
662 ops->get_ethtool_stats(dev, &stats, data);
663
664 ret = -EFAULT;
665 if (copy_to_user(useraddr, &stats, sizeof(stats)))
666 goto out;
667 useraddr += sizeof(stats);
668 if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
669 goto out;
670 ret = 0;
671
672 out:
673 kfree(data);
674 return ret;
675}
676
677/* The main entry point in this file. Called from net/core/dev.c */
678
679int dev_ethtool(struct ifreq *ifr)
680{
681 struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
682 void __user *useraddr = ifr->ifr_data;
683 u32 ethcmd;
684 int rc;
685
686 /*
687 * XXX: This can be pushed down into the ethtool_* handlers that
688 * need it. Keep existing behaviour for the moment.
689 */
690 if (!capable(CAP_NET_ADMIN))
691 return -EPERM;
692
693 if (!dev || !netif_device_present(dev))
694 return -ENODEV;
695
696 if (!dev->ethtool_ops)
697 goto ioctl;
698
699 if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd)))
700 return -EFAULT;
701
702 if(dev->ethtool_ops->begin)
703 if ((rc = dev->ethtool_ops->begin(dev)) < 0)
704 return rc;
705
706 switch (ethcmd) {
707 case ETHTOOL_GSET:
708 rc = ethtool_get_settings(dev, useraddr);
709 break;
710 case ETHTOOL_SSET:
711 rc = ethtool_set_settings(dev, useraddr);
712 break;
713 case ETHTOOL_GDRVINFO:
714 rc = ethtool_get_drvinfo(dev, useraddr);
715
716 break;
717 case ETHTOOL_GREGS:
718 rc = ethtool_get_regs(dev, useraddr);
719 break;
720 case ETHTOOL_GWOL:
721 rc = ethtool_get_wol(dev, useraddr);
722 break;
723 case ETHTOOL_SWOL:
724 rc = ethtool_set_wol(dev, useraddr);
725 break;
726 case ETHTOOL_GMSGLVL:
727 rc = ethtool_get_msglevel(dev, useraddr);
728 break;
729 case ETHTOOL_SMSGLVL:
730 rc = ethtool_set_msglevel(dev, useraddr);
731 break;
732 case ETHTOOL_NWAY_RST:
733 rc = ethtool_nway_reset(dev);
734 break;
735 case ETHTOOL_GLINK:
736 rc = ethtool_get_link(dev, useraddr);
737 break;
738 case ETHTOOL_GEEPROM:
739 rc = ethtool_get_eeprom(dev, useraddr);
740 break;
741 case ETHTOOL_SEEPROM:
742 rc = ethtool_set_eeprom(dev, useraddr);
743 break;
744 case ETHTOOL_GCOALESCE:
745 rc = ethtool_get_coalesce(dev, useraddr);
746 break;
747 case ETHTOOL_SCOALESCE:
748 rc = ethtool_set_coalesce(dev, useraddr);
749 break;
750 case ETHTOOL_GRINGPARAM:
751 rc = ethtool_get_ringparam(dev, useraddr);
752 break;
753 case ETHTOOL_SRINGPARAM:
754 rc = ethtool_set_ringparam(dev, useraddr);
755 break;
756 case ETHTOOL_GPAUSEPARAM:
757 rc = ethtool_get_pauseparam(dev, useraddr);
758 break;
759 case ETHTOOL_SPAUSEPARAM:
760 rc = ethtool_set_pauseparam(dev, useraddr);
761 break;
762 case ETHTOOL_GRXCSUM:
763 rc = ethtool_get_rx_csum(dev, useraddr);
764 break;
765 case ETHTOOL_SRXCSUM:
766 rc = ethtool_set_rx_csum(dev, useraddr);
767 break;
768 case ETHTOOL_GTXCSUM:
769 rc = ethtool_get_tx_csum(dev, useraddr);
770 break;
771 case ETHTOOL_STXCSUM:
772 rc = ethtool_set_tx_csum(dev, useraddr);
773 break;
774 case ETHTOOL_GSG:
775 rc = ethtool_get_sg(dev, useraddr);
776 break;
777 case ETHTOOL_SSG:
778 rc = ethtool_set_sg(dev, useraddr);
779 break;
780 case ETHTOOL_GTSO:
781 rc = ethtool_get_tso(dev, useraddr);
782 break;
783 case ETHTOOL_STSO:
784 rc = ethtool_set_tso(dev, useraddr);
785 break;
786 case ETHTOOL_TEST:
787 rc = ethtool_self_test(dev, useraddr);
788 break;
789 case ETHTOOL_GSTRINGS:
790 rc = ethtool_get_strings(dev, useraddr);
791 break;
792 case ETHTOOL_PHYS_ID:
793 rc = ethtool_phys_id(dev, useraddr);
794 break;
795 case ETHTOOL_GSTATS:
796 rc = ethtool_get_stats(dev, useraddr);
797 break;
798 default:
799 rc = -EOPNOTSUPP;
800 }
801
802 if(dev->ethtool_ops->complete)
803 dev->ethtool_ops->complete(dev);
804 return rc;
805
806 ioctl:
807 if (dev->do_ioctl)
808 return dev->do_ioctl(dev, ifr, SIOCETHTOOL);
809 return -EOPNOTSUPP;
810}
811
812EXPORT_SYMBOL(dev_ethtool);
813EXPORT_SYMBOL(ethtool_op_get_link);
814EXPORT_SYMBOL(ethtool_op_get_sg);
815EXPORT_SYMBOL(ethtool_op_get_tso);
816EXPORT_SYMBOL(ethtool_op_get_tx_csum);
817EXPORT_SYMBOL(ethtool_op_set_sg);
818EXPORT_SYMBOL(ethtool_op_set_tso);
819EXPORT_SYMBOL(ethtool_op_set_tx_csum);
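dev_ethtool() is the kernel side of the SIOCETHTOOL ioctl: ifr_name names the device and ifr_data points at an ethtool structure whose leading u32 selects the sub-command. Below is a minimal user-space sketch of the ETHTOOL_GLINK path served by ethtool_get_link() above; "eth0" is only a placeholder interface name, and CAP_NET_ADMIN is needed because dev_ethtool() checks it before dispatching.

/* Query link state through SIOCETHTOOL/ETHTOOL_GLINK, the command
 * served by ethtool_get_link() above. "eth0" is a placeholder name. */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	ifr.ifr_data = (char *)&edata;

	if (fd < 0 || ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
		perror("SIOCETHTOOL");
		return 1;
	}
	printf("%s: link is %s\n", ifr.ifr_name, edata.data ? "up" : "down");
	return 0;
}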
diff --git a/net/core/filter.c b/net/core/filter.c
new file mode 100644
index 000000000000..f3b88205ace2
--- /dev/null
+++ b/net/core/filter.c
@@ -0,0 +1,432 @@
1/*
2 * Linux Socket Filter - Kernel level socket filtering
3 *
4 * Author:
5 * Jay Schulist <jschlst@samba.org>
6 *
7 * Based on the design of:
8 * - The Berkeley Packet Filter
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Andi Kleen - Fix a few bad bugs and races.
16 */
17
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/sched.h>
21#include <linux/mm.h>
22#include <linux/fcntl.h>
23#include <linux/socket.h>
24#include <linux/in.h>
25#include <linux/inet.h>
26#include <linux/netdevice.h>
27#include <linux/if_packet.h>
28#include <net/ip.h>
29#include <net/protocol.h>
30#include <linux/skbuff.h>
31#include <net/sock.h>
32#include <linux/errno.h>
33#include <linux/timer.h>
34#include <asm/system.h>
35#include <asm/uaccess.h>
36#include <linux/filter.h>
37
38/* No hurry in this branch */
39static u8 *load_pointer(struct sk_buff *skb, int k)
40{
41 u8 *ptr = NULL;
42
43 if (k >= SKF_NET_OFF)
44 ptr = skb->nh.raw + k - SKF_NET_OFF;
45 else if (k >= SKF_LL_OFF)
46 ptr = skb->mac.raw + k - SKF_LL_OFF;
47
48 if (ptr >= skb->head && ptr < skb->tail)
49 return ptr;
50 return NULL;
51}
52
53/**
54 * sk_run_filter - run a filter on a socket
55 * @skb: buffer to run the filter on
56 * @filter: filter to apply
57 * @flen: length of filter
58 *
59 * Decode and apply filter instructions to the skb->data.
60 * Return length to keep, 0 for none. skb is the data we are
61 * filtering, filter is the array of filter instructions, and
62 * len is the number of filter blocks in the array.
63 */
64
65int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
66{
67 unsigned char *data = skb->data;
68	/* len is UNSIGNED. Byte wide insns rely only on implicit
69 type casts to prevent reading arbitrary memory locations.
70 */
71 unsigned int len = skb->len-skb->data_len;
72 struct sock_filter *fentry; /* We walk down these */
73 u32 A = 0; /* Accumulator */
74 u32 X = 0; /* Index Register */
75 u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
76 int k;
77 int pc;
78
79 /*
80 * Process array of filter instructions.
81 */
82 for (pc = 0; pc < flen; pc++) {
83 fentry = &filter[pc];
84
85 switch (fentry->code) {
86 case BPF_ALU|BPF_ADD|BPF_X:
87 A += X;
88 continue;
89 case BPF_ALU|BPF_ADD|BPF_K:
90 A += fentry->k;
91 continue;
92 case BPF_ALU|BPF_SUB|BPF_X:
93 A -= X;
94 continue;
95 case BPF_ALU|BPF_SUB|BPF_K:
96 A -= fentry->k;
97 continue;
98 case BPF_ALU|BPF_MUL|BPF_X:
99 A *= X;
100 continue;
101 case BPF_ALU|BPF_MUL|BPF_K:
102 A *= fentry->k;
103 continue;
104 case BPF_ALU|BPF_DIV|BPF_X:
105 if (X == 0)
106 return 0;
107 A /= X;
108 continue;
109 case BPF_ALU|BPF_DIV|BPF_K:
110 if (fentry->k == 0)
111 return 0;
112 A /= fentry->k;
113 continue;
114 case BPF_ALU|BPF_AND|BPF_X:
115 A &= X;
116 continue;
117 case BPF_ALU|BPF_AND|BPF_K:
118 A &= fentry->k;
119 continue;
120 case BPF_ALU|BPF_OR|BPF_X:
121 A |= X;
122 continue;
123 case BPF_ALU|BPF_OR|BPF_K:
124 A |= fentry->k;
125 continue;
126 case BPF_ALU|BPF_LSH|BPF_X:
127 A <<= X;
128 continue;
129 case BPF_ALU|BPF_LSH|BPF_K:
130 A <<= fentry->k;
131 continue;
132 case BPF_ALU|BPF_RSH|BPF_X:
133 A >>= X;
134 continue;
135 case BPF_ALU|BPF_RSH|BPF_K:
136 A >>= fentry->k;
137 continue;
138 case BPF_ALU|BPF_NEG:
139 A = -A;
140 continue;
141 case BPF_JMP|BPF_JA:
142 pc += fentry->k;
143 continue;
144 case BPF_JMP|BPF_JGT|BPF_K:
145 pc += (A > fentry->k) ? fentry->jt : fentry->jf;
146 continue;
147 case BPF_JMP|BPF_JGE|BPF_K:
148 pc += (A >= fentry->k) ? fentry->jt : fentry->jf;
149 continue;
150 case BPF_JMP|BPF_JEQ|BPF_K:
151 pc += (A == fentry->k) ? fentry->jt : fentry->jf;
152 continue;
153 case BPF_JMP|BPF_JSET|BPF_K:
154 pc += (A & fentry->k) ? fentry->jt : fentry->jf;
155 continue;
156 case BPF_JMP|BPF_JGT|BPF_X:
157 pc += (A > X) ? fentry->jt : fentry->jf;
158 continue;
159 case BPF_JMP|BPF_JGE|BPF_X:
160 pc += (A >= X) ? fentry->jt : fentry->jf;
161 continue;
162 case BPF_JMP|BPF_JEQ|BPF_X:
163 pc += (A == X) ? fentry->jt : fentry->jf;
164 continue;
165 case BPF_JMP|BPF_JSET|BPF_X:
166 pc += (A & X) ? fentry->jt : fentry->jf;
167 continue;
168 case BPF_LD|BPF_W|BPF_ABS:
169 k = fentry->k;
170 load_w:
171 if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) {
172 A = ntohl(*(u32*)&data[k]);
173 continue;
174 }
175 if (k < 0) {
176 u8 *ptr;
177
178 if (k >= SKF_AD_OFF)
179 break;
180 ptr = load_pointer(skb, k);
181 if (ptr) {
182 A = ntohl(*(u32*)ptr);
183 continue;
184 }
185 } else {
186 u32 _tmp, *p;
187 p = skb_header_pointer(skb, k, 4, &_tmp);
188 if (p != NULL) {
189 A = ntohl(*p);
190 continue;
191 }
192 }
193 return 0;
194 case BPF_LD|BPF_H|BPF_ABS:
195 k = fentry->k;
196 load_h:
197 if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) {
198 A = ntohs(*(u16*)&data[k]);
199 continue;
200 }
201 if (k < 0) {
202 u8 *ptr;
203
204 if (k >= SKF_AD_OFF)
205 break;
206 ptr = load_pointer(skb, k);
207 if (ptr) {
208 A = ntohs(*(u16*)ptr);
209 continue;
210 }
211 } else {
212 u16 _tmp, *p;
213 p = skb_header_pointer(skb, k, 2, &_tmp);
214 if (p != NULL) {
215 A = ntohs(*p);
216 continue;
217 }
218 }
219 return 0;
220 case BPF_LD|BPF_B|BPF_ABS:
221 k = fentry->k;
222load_b:
223 if (k >= 0 && (unsigned int)k < len) {
224 A = data[k];
225 continue;
226 }
227 if (k < 0) {
228 u8 *ptr;
229
230 if (k >= SKF_AD_OFF)
231 break;
232 ptr = load_pointer(skb, k);
233 if (ptr) {
234 A = *ptr;
235 continue;
236 }
237 } else {
238 u8 _tmp, *p;
239 p = skb_header_pointer(skb, k, 1, &_tmp);
240 if (p != NULL) {
241 A = *p;
242 continue;
243 }
244 }
245 return 0;
246 case BPF_LD|BPF_W|BPF_LEN:
247 A = len;
248 continue;
249 case BPF_LDX|BPF_W|BPF_LEN:
250 X = len;
251 continue;
252 case BPF_LD|BPF_W|BPF_IND:
253 k = X + fentry->k;
254 goto load_w;
255 case BPF_LD|BPF_H|BPF_IND:
256 k = X + fentry->k;
257 goto load_h;
258 case BPF_LD|BPF_B|BPF_IND:
259 k = X + fentry->k;
260 goto load_b;
261 case BPF_LDX|BPF_B|BPF_MSH:
262 if (fentry->k >= len)
263 return 0;
264 X = (data[fentry->k] & 0xf) << 2;
265 continue;
266 case BPF_LD|BPF_IMM:
267 A = fentry->k;
268 continue;
269 case BPF_LDX|BPF_IMM:
270 X = fentry->k;
271 continue;
272 case BPF_LD|BPF_MEM:
273 A = mem[fentry->k];
274 continue;
275 case BPF_LDX|BPF_MEM:
276 X = mem[fentry->k];
277 continue;
278 case BPF_MISC|BPF_TAX:
279 X = A;
280 continue;
281 case BPF_MISC|BPF_TXA:
282 A = X;
283 continue;
284 case BPF_RET|BPF_K:
285 return ((unsigned int)fentry->k);
286 case BPF_RET|BPF_A:
287 return ((unsigned int)A);
288 case BPF_ST:
289 mem[fentry->k] = A;
290 continue;
291 case BPF_STX:
292 mem[fentry->k] = X;
293 continue;
294 default:
295 /* Invalid instruction counts as RET */
296 return 0;
297 }
298
299 /*
300 * Handle ancillary data, which are impossible
301		 * (or very difficult) to get by parsing packet contents.
302 */
303 switch (k-SKF_AD_OFF) {
304 case SKF_AD_PROTOCOL:
305 A = htons(skb->protocol);
306 continue;
307 case SKF_AD_PKTTYPE:
308 A = skb->pkt_type;
309 continue;
310 case SKF_AD_IFINDEX:
311 A = skb->dev->ifindex;
312 continue;
313 default:
314 return 0;
315 }
316 }
317
318 return 0;
319}
320
321/**
322 * sk_chk_filter - verify socket filter code
323 * @filter: filter to verify
324 * @flen: length of filter
325 *
326 * Check the user's filter code. If we let some ugly
327 *	filter code slip through, kaboom! The filter must contain
328 *	no references or jumps that are out of range, no illegal instructions,
329 *	and no backward jumps. It must end with a RET instruction.
330 *
331 * Returns 0 if the rule set is legal or a negative errno code if not.
332 */
333int sk_chk_filter(struct sock_filter *filter, int flen)
334{
335 struct sock_filter *ftest;
336 int pc;
337
338 if (((unsigned int)flen >= (~0U / sizeof(struct sock_filter))) || flen == 0)
339 return -EINVAL;
340
341 /* check the filter code now */
342 for (pc = 0; pc < flen; pc++) {
343 /* all jumps are forward as they are not signed */
344 ftest = &filter[pc];
345 if (BPF_CLASS(ftest->code) == BPF_JMP) {
346 /* but they mustn't jump off the end */
347 if (BPF_OP(ftest->code) == BPF_JA) {
348 /*
349 * Note, the large ftest->k might cause loops.
350 * Compare this with conditional jumps below,
351 * where offsets are limited. --ANK (981016)
352 */
353 if (ftest->k >= (unsigned)(flen-pc-1))
354 return -EINVAL;
355 } else {
356 /* for conditionals both must be safe */
357 if (pc + ftest->jt +1 >= flen ||
358 pc + ftest->jf +1 >= flen)
359 return -EINVAL;
360 }
361 }
362
363 /* check that memory operations use valid addresses. */
364 if (ftest->k >= BPF_MEMWORDS) {
365 /* but it might not be a memory operation... */
366 switch (ftest->code) {
367 case BPF_ST:
368 case BPF_STX:
369 case BPF_LD|BPF_MEM:
370 case BPF_LDX|BPF_MEM:
371 return -EINVAL;
372 }
373 }
374 }
375
376 /*
377 * The program must end with a return. We don't care where they
378 * jumped within the script (it's always forwards) but in the end
379 * they _will_ hit this.
380 */
381 return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
382}
383
384/**
385 * sk_attach_filter - attach a socket filter
386 * @fprog: the filter program
387 * @sk: the socket to use
388 *
389 * Attach the user's filter code. We first run some sanity checks on
390 * it to make sure it does not explode on us later. If an error
391 * occurs or there is insufficient memory for the filter a negative
392 * errno code is returned. On success the return is zero.
393 */
394int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
395{
396 struct sk_filter *fp;
397 unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
398 int err;
399
400 /* Make sure new filter is there and in the right amounts. */
401 if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS)
402 return -EINVAL;
403
404 fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
405 if (!fp)
406 return -ENOMEM;
407 if (copy_from_user(fp->insns, fprog->filter, fsize)) {
408 sock_kfree_s(sk, fp, fsize+sizeof(*fp));
409 return -EFAULT;
410 }
411
412 atomic_set(&fp->refcnt, 1);
413 fp->len = fprog->len;
414
415 err = sk_chk_filter(fp->insns, fp->len);
416 if (!err) {
417 struct sk_filter *old_fp;
418
419 spin_lock_bh(&sk->sk_lock.slock);
420 old_fp = sk->sk_filter;
421 sk->sk_filter = fp;
422 spin_unlock_bh(&sk->sk_lock.slock);
423 fp = old_fp;
424 }
425
426 if (fp)
427 sk_filter_release(sk, fp);
428 return err;
429}
430
431EXPORT_SYMBOL(sk_chk_filter);
432EXPORT_SYMBOL(sk_run_filter);
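sk_attach_filter() above is what the SO_ATTACH_FILTER socket option lands on: user space passes a struct sock_fprog, sk_chk_filter() validates the program, and sk_run_filter() then runs it against every packet queued to the socket. A small sketch follows that keeps only ARP frames on a packet socket; the BPF_STMT/BPF_JUMP helpers and the ETH_P_ constants come from <linux/filter.h> and <linux/if_ether.h>, and the AF_PACKET socket needs root.

/* Attach a four-instruction BPF program with SO_ATTACH_FILTER; the
 * kernel side is sk_attach_filter()/sk_chk_filter() above. The filter
 * loads the ethertype at offset 12 and keeps the frame only if it is
 * ARP (sk_run_filter() loads it with ntohs(), so compare host-order). */
#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/filter.h>
#include <linux/if_ether.h>

int main(void)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_LD  | BPF_H   | BPF_ABS, 12),	/* A = ethertype */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_ARP, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 0xffffffffU),		/* ARP: keep all */
		BPF_STMT(BPF_RET | BPF_K, 0),			/* else: drop */
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); /* needs root */

	if (fd < 0 ||
	    setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)) < 0) {
		perror("SO_ATTACH_FILTER");
		return 1;
	}
	puts("filter attached; only ARP frames will be delivered");
	return 0;
}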
diff --git a/net/core/flow.c b/net/core/flow.c
new file mode 100644
index 000000000000..f289570b15a3
--- /dev/null
+++ b/net/core/flow.c
@@ -0,0 +1,371 @@
1/* flow.c: Generic flow cache.
2 *
3 * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
4 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
5 */
6
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <linux/list.h>
10#include <linux/jhash.h>
11#include <linux/interrupt.h>
12#include <linux/mm.h>
13#include <linux/random.h>
14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/smp.h>
17#include <linux/completion.h>
18#include <linux/percpu.h>
19#include <linux/bitops.h>
20#include <linux/notifier.h>
21#include <linux/cpu.h>
22#include <linux/cpumask.h>
23#include <net/flow.h>
24#include <asm/atomic.h>
25#include <asm/semaphore.h>
26
27struct flow_cache_entry {
28 struct flow_cache_entry *next;
29 u16 family;
30 u8 dir;
31 struct flowi key;
32 u32 genid;
33 void *object;
34 atomic_t *object_ref;
35};
36
37atomic_t flow_cache_genid = ATOMIC_INIT(0);
38
39static u32 flow_hash_shift;
40#define flow_hash_size (1 << flow_hash_shift)
41static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
42
43#define flow_table(cpu) (per_cpu(flow_tables, cpu))
44
45static kmem_cache_t *flow_cachep;
46
47static int flow_lwm, flow_hwm;
48
49struct flow_percpu_info {
50 int hash_rnd_recalc;
51 u32 hash_rnd;
52 int count;
53} ____cacheline_aligned;
54static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
55
56#define flow_hash_rnd_recalc(cpu) \
57 (per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
58#define flow_hash_rnd(cpu) \
59 (per_cpu(flow_hash_info, cpu).hash_rnd)
60#define flow_count(cpu) \
61 (per_cpu(flow_hash_info, cpu).count)
62
63static struct timer_list flow_hash_rnd_timer;
64
65#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
66
67struct flow_flush_info {
68 atomic_t cpuleft;
69 struct completion completion;
70};
71static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
72
73#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
74
75static void flow_cache_new_hashrnd(unsigned long arg)
76{
77 int i;
78
79 for_each_cpu(i)
80 flow_hash_rnd_recalc(i) = 1;
81
82 flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
83 add_timer(&flow_hash_rnd_timer);
84}
85
86static void __flow_cache_shrink(int cpu, int shrink_to)
87{
88 struct flow_cache_entry *fle, **flp;
89 int i;
90
91 for (i = 0; i < flow_hash_size; i++) {
92 int k = 0;
93
94 flp = &flow_table(cpu)[i];
95 while ((fle = *flp) != NULL && k < shrink_to) {
96 k++;
97 flp = &fle->next;
98 }
99 while ((fle = *flp) != NULL) {
100 *flp = fle->next;
101 if (fle->object)
102 atomic_dec(fle->object_ref);
103 kmem_cache_free(flow_cachep, fle);
104 flow_count(cpu)--;
105 }
106 }
107}
108
109static void flow_cache_shrink(int cpu)
110{
111 int shrink_to = flow_lwm / flow_hash_size;
112
113 __flow_cache_shrink(cpu, shrink_to);
114}
115
116static void flow_new_hash_rnd(int cpu)
117{
118 get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
119 flow_hash_rnd_recalc(cpu) = 0;
120
121 __flow_cache_shrink(cpu, 0);
122}
123
124static u32 flow_hash_code(struct flowi *key, int cpu)
125{
126 u32 *k = (u32 *) key;
127
128 return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
129 (flow_hash_size - 1));
130}
131
132#if (BITS_PER_LONG == 64)
133typedef u64 flow_compare_t;
134#else
135typedef u32 flow_compare_t;
136#endif
137
138extern void flowi_is_missized(void);
139
140/* I hear what you're saying, use memcmp. But memcmp cannot make
141 * important assumptions that we can here, such as alignment and
142 * constant size.
143 */
144static int flow_key_compare(struct flowi *key1, struct flowi *key2)
145{
146 flow_compare_t *k1, *k1_lim, *k2;
147 const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
148
149 if (sizeof(struct flowi) % sizeof(flow_compare_t))
150 flowi_is_missized();
151
152 k1 = (flow_compare_t *) key1;
153 k1_lim = k1 + n_elem;
154
155 k2 = (flow_compare_t *) key2;
156
157 do {
158 if (*k1++ != *k2++)
159 return 1;
160 } while (k1 < k1_lim);
161
162 return 0;
163}
164
165void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
166 flow_resolve_t resolver)
167{
168 struct flow_cache_entry *fle, **head;
169 unsigned int hash;
170 int cpu;
171
172 local_bh_disable();
173 cpu = smp_processor_id();
174
175 fle = NULL;
176 /* Packet really early in init? Making flow_cache_init a
177 * pre-smp initcall would solve this. --RR */
178 if (!flow_table(cpu))
179 goto nocache;
180
181 if (flow_hash_rnd_recalc(cpu))
182 flow_new_hash_rnd(cpu);
183 hash = flow_hash_code(key, cpu);
184
185 head = &flow_table(cpu)[hash];
186 for (fle = *head; fle; fle = fle->next) {
187 if (fle->family == family &&
188 fle->dir == dir &&
189 flow_key_compare(key, &fle->key) == 0) {
190 if (fle->genid == atomic_read(&flow_cache_genid)) {
191 void *ret = fle->object;
192
193 if (ret)
194 atomic_inc(fle->object_ref);
195 local_bh_enable();
196
197 return ret;
198 }
199 break;
200 }
201 }
202
203 if (!fle) {
204 if (flow_count(cpu) > flow_hwm)
205 flow_cache_shrink(cpu);
206
207 fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC);
208 if (fle) {
209 fle->next = *head;
210 *head = fle;
211 fle->family = family;
212 fle->dir = dir;
213 memcpy(&fle->key, key, sizeof(*key));
214 fle->object = NULL;
215 flow_count(cpu)++;
216 }
217 }
218
219nocache:
220 {
221 void *obj;
222 atomic_t *obj_ref;
223
224 resolver(key, family, dir, &obj, &obj_ref);
225
226 if (fle) {
227 fle->genid = atomic_read(&flow_cache_genid);
228
229 if (fle->object)
230 atomic_dec(fle->object_ref);
231
232 fle->object = obj;
233 fle->object_ref = obj_ref;
234 if (obj)
235 atomic_inc(fle->object_ref);
236 }
237 local_bh_enable();
238
239 return obj;
240 }
241}
242
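Entries record the value of flow_cache_genid when they are filled, and the lookup above re-resolves any entry whose recorded genid no longer matches, so the whole cache can be invalidated just by bumping that counter. A minimal stand-alone user-space sketch of this generation-counter idea (hypothetical names, no locking or per-CPU handling, not kernel code):

#include <stdio.h>

/* Sketch only: global generation counter, bumped to invalidate everything. */
static unsigned int cache_genid;

struct cache_entry {
	int valid;
	unsigned int genid;	/* genid observed when the entry was filled */
	int value;
};

static int cache_get(struct cache_entry *e, int (*resolve)(void))
{
	if (e->valid && e->genid == cache_genid)
		return e->value;		/* still fresh */
	e->value = resolve();			/* stale or empty: refill lazily */
	e->genid = cache_genid;
	e->valid = 1;
	return e->value;
}

static int resolve_slow(void) { return 42; }

int main(void)
{
	struct cache_entry e = { 0, 0, 0 };

	printf("%d\n", cache_get(&e, resolve_slow));	/* miss, resolves */
	printf("%d\n", cache_get(&e, resolve_slow));	/* hit */
	cache_genid++;					/* "flush": invalidate lazily */
	printf("%d\n", cache_get(&e, resolve_slow));	/* miss again */
	return 0;
}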
243static void flow_cache_flush_tasklet(unsigned long data)
244{
245 struct flow_flush_info *info = (void *)data;
246 int i;
247 int cpu;
248
249 cpu = smp_processor_id();
250 for (i = 0; i < flow_hash_size; i++) {
251 struct flow_cache_entry *fle;
252
253 fle = flow_table(cpu)[i];
254 for (; fle; fle = fle->next) {
255 unsigned genid = atomic_read(&flow_cache_genid);
256
257 if (!fle->object || fle->genid == genid)
258 continue;
259
260 fle->object = NULL;
261 atomic_dec(fle->object_ref);
262 }
263 }
264
265 if (atomic_dec_and_test(&info->cpuleft))
266 complete(&info->completion);
267}
268
269static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
270static void flow_cache_flush_per_cpu(void *data)
271{
272 struct flow_flush_info *info = data;
273 int cpu;
274 struct tasklet_struct *tasklet;
275
276 cpu = smp_processor_id();
277
278 tasklet = flow_flush_tasklet(cpu);
279 tasklet->data = (unsigned long)info;
280 tasklet_schedule(tasklet);
281}
282
283void flow_cache_flush(void)
284{
285 struct flow_flush_info info;
286 static DECLARE_MUTEX(flow_flush_sem);
287
288 /* Don't want cpus going down or up during this. */
289 lock_cpu_hotplug();
290 down(&flow_flush_sem);
291 atomic_set(&info.cpuleft, num_online_cpus());
292 init_completion(&info.completion);
293
294 local_bh_disable();
295 smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0);
296 flow_cache_flush_tasklet((unsigned long)&info);
297 local_bh_enable();
298
299 wait_for_completion(&info.completion);
300 up(&flow_flush_sem);
301 unlock_cpu_hotplug();
302}
303
304static void __devinit flow_cache_cpu_prepare(int cpu)
305{
306 struct tasklet_struct *tasklet;
307 unsigned long order;
308
309 for (order = 0;
310 (PAGE_SIZE << order) <
311 (sizeof(struct flow_cache_entry *)*flow_hash_size);
312 order++)
313 /* NOTHING */;
314
315 flow_table(cpu) = (struct flow_cache_entry **)
316 __get_free_pages(GFP_KERNEL, order);
317 if (!flow_table(cpu))
318 panic("NET: failed to allocate flow cache order %lu\n", order);
319
320 memset(flow_table(cpu), 0, PAGE_SIZE << order);
321
322 flow_hash_rnd_recalc(cpu) = 1;
323 flow_count(cpu) = 0;
324
325 tasklet = flow_flush_tasklet(cpu);
326 tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
327}
328
329#ifdef CONFIG_HOTPLUG_CPU
330static int flow_cache_cpu(struct notifier_block *nfb,
331 unsigned long action,
332 void *hcpu)
333{
334 if (action == CPU_DEAD)
335 __flow_cache_shrink((unsigned long)hcpu, 0);
336 return NOTIFY_OK;
337}
338#endif /* CONFIG_HOTPLUG_CPU */
339
340static int __init flow_cache_init(void)
341{
342 int i;
343
344 flow_cachep = kmem_cache_create("flow_cache",
345 sizeof(struct flow_cache_entry),
346 0, SLAB_HWCACHE_ALIGN,
347 NULL, NULL);
348
349 if (!flow_cachep)
350 panic("NET: failed to allocate flow cache slab\n");
351
352 flow_hash_shift = 10;
353 flow_lwm = 2 * flow_hash_size;
354 flow_hwm = 4 * flow_hash_size;
355
356 init_timer(&flow_hash_rnd_timer);
357 flow_hash_rnd_timer.function = flow_cache_new_hashrnd;
358 flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
359 add_timer(&flow_hash_rnd_timer);
360
361 for_each_cpu(i)
362 flow_cache_cpu_prepare(i);
363
364 hotcpu_notifier(flow_cache_cpu, 0);
365 return 0;
366}
367
368module_init(flow_cache_init);
369
370EXPORT_SYMBOL(flow_cache_genid);
371EXPORT_SYMBOL(flow_cache_lookup);
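flow_key_compare() above relies on struct flowi being word-aligned and a whole number of words long, which is what lets it skip memcmp(). A stand-alone user-space sketch of the same word-wise comparison, assuming a hypothetical fixed-size key structure:

#include <stdio.h>
#include <string.h>

typedef unsigned long cmp_t;

struct key {			/* stand-in for struct flowi: aligned, constant size */
	unsigned long a, b, c, d;
};

static int key_compare(const struct key *k1, const struct key *k2)
{
	const cmp_t *p1 = (const cmp_t *)k1;
	const cmp_t *p2 = (const cmp_t *)k2;
	const cmp_t *end = p1 + sizeof(struct key) / sizeof(cmp_t);

	/* valid only because sizeof(struct key) is a multiple of sizeof(cmp_t) */
	do {
		if (*p1++ != *p2++)
			return 1;	/* keys differ */
	} while (p1 < end);
	return 0;			/* keys equal */
}

int main(void)
{
	struct key k1 = { 1, 2, 3, 4 }, k2 = { 1, 2, 3, 4 };

	printf("%d %d\n", key_compare(&k1, &k2), memcmp(&k1, &k2, sizeof(k1)) != 0);
	k2.d = 5;
	printf("%d %d\n", key_compare(&k1, &k2), memcmp(&k1, &k2, sizeof(k1)) != 0);
	return 0;
}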
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
new file mode 100644
index 000000000000..b07c029e8219
--- /dev/null
+++ b/net/core/gen_estimator.c
@@ -0,0 +1,250 @@
1/*
2 * net/sched/gen_estimator.c Simple rate estimator.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Changes:
12 *              Jamal Hadi Salim - moved it to net/core and reshuffled
13 * names to make it usable in general net subsystem.
14 */
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <asm/bitops.h>
19#include <linux/module.h>
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/jiffies.h>
23#include <linux/string.h>
24#include <linux/mm.h>
25#include <linux/socket.h>
26#include <linux/sockios.h>
27#include <linux/in.h>
28#include <linux/errno.h>
29#include <linux/interrupt.h>
30#include <linux/netdevice.h>
31#include <linux/skbuff.h>
32#include <linux/rtnetlink.h>
33#include <linux/init.h>
34#include <net/sock.h>
35#include <net/gen_stats.h>
36
37/*
38 This code is NOT intended to be used for statistics collection,
39 its purpose is to provide a base for statistical multiplexing
40 for controlled load service.
41 If you need only statistics, run a user level daemon which
42 periodically reads byte counters.
43
44 Unfortunately, rate estimation is not a very easy task.
45 F.e. I did not find a simple way to estimate the current peak rate
46 and even failed to formulate the problem 8)8)
47
48   So I preferred not to build an estimator into the scheduler,
49 but run this task separately.
50 Ideally, it should be kernel thread(s), but for now it runs
51 from timers, which puts apparent top bounds on the number of rated
52   flows, has minimal overhead on small setups, but is enough
53   to handle controlled load service and sets of aggregates.
54
55 We measure rate over A=(1<<interval) seconds and evaluate EWMA:
56
57 avrate = avrate*(1-W) + rate*W
58
59 where W is chosen as negative power of 2: W = 2^(-ewma_log)
60
61 The resulting time constant is:
62
63 T = A/(-ln(1-W))
64
65
66 NOTES.
67
68 * The stored value for avbps is scaled by 2^5, so that maximal
69 rate is ~1Gbit, avpps is scaled by 2^10.
70
71 * Minimal interval is HZ/4=250msec (it is the greatest common divisor
72 for HZ=100 and HZ=1024 8)), maximal interval
73 is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
74 are too expensive, longer ones can be implemented
75 at user level painlessly.
76 */
77
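The update in est_timer() below implements the EWMA above as avrate += (rate - avrate) >> ewma_log, i.e. W = 2^(-ewma_log). A small stand-alone user-space sketch of that update (the <<5 scale factor mirrors the avbps scaling noted above; sample values are made up):

#include <stdio.h>

int main(void)
{
	long avbps = 0;		/* smoothed rate, kept scaled by 2^5 like avbps above */
	int ewma_log = 3;	/* W = 2^(-3) = 1/8 */
	long samples[] = { 1000, 1000, 1000, 4000, 4000, 4000 };	/* bytes per period */
	int i;

	for (i = 0; i < 6; i++) {
		long rate = samples[i] << 5;		/* scale the new measurement */
		avbps += (rate - avbps) >> ewma_log;	/* avrate = avrate*(1-W) + rate*W */
		printf("sample %ld -> average %ld\n", samples[i], avbps >> 5);
	}
	/* the average tracks 1000, then converges toward the new 4000 rate */
	return 0;
}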
78#define EST_MAX_INTERVAL 5
79
80struct gen_estimator
81{
82 struct gen_estimator *next;
83 struct gnet_stats_basic *bstats;
84 struct gnet_stats_rate_est *rate_est;
85 spinlock_t *stats_lock;
86 unsigned interval;
87 int ewma_log;
88 u64 last_bytes;
89 u32 last_packets;
90 u32 avpps;
91 u32 avbps;
92};
93
94struct gen_estimator_head
95{
96 struct timer_list timer;
97 struct gen_estimator *list;
98};
99
100static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
101
102/* Estimator array lock */
103static DEFINE_RWLOCK(est_lock);
104
105static void est_timer(unsigned long arg)
106{
107 int idx = (int)arg;
108 struct gen_estimator *e;
109
110 read_lock(&est_lock);
111 for (e = elist[idx].list; e; e = e->next) {
112 u64 nbytes;
113 u32 npackets;
114 u32 rate;
115
116 spin_lock(e->stats_lock);
117 nbytes = e->bstats->bytes;
118 npackets = e->bstats->packets;
119 rate = (nbytes - e->last_bytes)<<(7 - idx);
120 e->last_bytes = nbytes;
121 e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
122 e->rate_est->bps = (e->avbps+0xF)>>5;
123
124 rate = (npackets - e->last_packets)<<(12 - idx);
125 e->last_packets = npackets;
126 e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
127 e->rate_est->pps = (e->avpps+0x1FF)>>10;
128 spin_unlock(e->stats_lock);
129 }
130
131 mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
132 read_unlock(&est_lock);
133}
134
135/**
136 * gen_new_estimator - create a new rate estimator
137 * @bstats: basic statistics
138 * @rate_est: rate estimator statistics
139 * @stats_lock: statistics lock
140 * @opt: rate estimator configuration TLV
141 *
142 * Creates a new rate estimator with &bstats as source and &rate_est
143 * as destination. A new timer with the interval specified in the
144 * configuration TLV is created. Upon each interval, the latest statistics
145 * will be read from &bstats and the estimated rate will be stored in
146 * &rate_est with the statistics lock grabbed during this period.
147 *
148 * Returns 0 on success or a negative error code.
149 */
150int gen_new_estimator(struct gnet_stats_basic *bstats,
151 struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct rtattr *opt)
152{
153 struct gen_estimator *est;
154 struct gnet_estimator *parm = RTA_DATA(opt);
155
156 if (RTA_PAYLOAD(opt) < sizeof(*parm))
157 return -EINVAL;
158
159 if (parm->interval < -2 || parm->interval > 3)
160 return -EINVAL;
161
162 est = kmalloc(sizeof(*est), GFP_KERNEL);
163 if (est == NULL)
164 return -ENOBUFS;
165
166 memset(est, 0, sizeof(*est));
167 est->interval = parm->interval + 2;
168 est->bstats = bstats;
169 est->rate_est = rate_est;
170 est->stats_lock = stats_lock;
171 est->ewma_log = parm->ewma_log;
172 est->last_bytes = bstats->bytes;
173 est->avbps = rate_est->bps<<5;
174 est->last_packets = bstats->packets;
175 est->avpps = rate_est->pps<<10;
176
177 est->next = elist[est->interval].list;
178 if (est->next == NULL) {
179 init_timer(&elist[est->interval].timer);
180 elist[est->interval].timer.data = est->interval;
181 elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
182 elist[est->interval].timer.function = est_timer;
183 add_timer(&elist[est->interval].timer);
184 }
185 write_lock_bh(&est_lock);
186 elist[est->interval].list = est;
187 write_unlock_bh(&est_lock);
188 return 0;
189}
190
191/**
192 * gen_kill_estimator - remove a rate estimator
193 * @bstats: basic statistics
194 * @rate_est: rate estimator statistics
195 *
196 * Removes the rate estimator specified by &bstats and &rate_est
197 * and deletes the timer.
198 */
199void gen_kill_estimator(struct gnet_stats_basic *bstats,
200 struct gnet_stats_rate_est *rate_est)
201{
202 int idx;
203 struct gen_estimator *est, **pest;
204
205 for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
206 int killed = 0;
207 pest = &elist[idx].list;
208 while ((est=*pest) != NULL) {
209 if (est->rate_est != rate_est || est->bstats != bstats) {
210 pest = &est->next;
211 continue;
212 }
213
214 write_lock_bh(&est_lock);
215 *pest = est->next;
216 write_unlock_bh(&est_lock);
217
218 kfree(est);
219 killed++;
220 }
221 if (killed && elist[idx].list == NULL)
222 del_timer(&elist[idx].timer);
223 }
224}
225
226/**
227 * gen_replace_estimator - replace rate estimator configuration
228 * @bstats: basic statistics
229 * @rate_est: rate estimator statistics
230 * @stats_lock: statistics lock
231 * @opt: rate estimator configuration TLV
232 *
233 * Replaces the configuration of a rate estimator by calling
234 * gen_kill_estimator() and gen_new_estimator().
235 *
236 * Returns 0 on success or a negative error code.
237 */
238int
239gen_replace_estimator(struct gnet_stats_basic *bstats,
240 struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock,
241 struct rtattr *opt)
242{
243 gen_kill_estimator(bstats, rate_est);
244 return gen_new_estimator(bstats, rate_est, stats_lock, opt);
245}
246
247
248EXPORT_SYMBOL(gen_kill_estimator);
249EXPORT_SYMBOL(gen_new_estimator);
250EXPORT_SYMBOL(gen_replace_estimator);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
new file mode 100644
index 000000000000..8f21490355fa
--- /dev/null
+++ b/net/core/gen_stats.c
@@ -0,0 +1,239 @@
1/*
2 * net/core/gen_stats.c
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 * Jamal Hadi Salim
11 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 *
13 * See Documentation/networking/gen_stats.txt
14 */
15
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <linux/interrupt.h>
20#include <linux/socket.h>
21#include <linux/rtnetlink.h>
22#include <linux/gen_stats.h>
23#include <net/gen_stats.h>
24
25
26static inline int
27gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
28{
29 RTA_PUT(d->skb, type, size, buf);
30 return 0;
31
32rtattr_failure:
33 spin_unlock_bh(d->lock);
34 return -1;
35}
36
37/**
38 * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
39 * @skb: socket buffer to put statistics TLVs into
40 * @type: TLV type for top level statistic TLV
41 * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV
42 * @xstats_type: TLV type for backward compatibility xstats TLV
43 * @lock: statistics lock
44 * @d: dumping handle
45 *
46 * Initializes the dumping handle, grabs the statistic lock and appends
47 * an empty TLV header to the socket buffer for use as a container for all
48 * other statistic TLVs.
49 *
50 * The dumping handle is marked to be in backward compatibility mode telling
51 * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats.
52 *
53 * Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
54 */
55int
56gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
57 int xstats_type, spinlock_t *lock, struct gnet_dump *d)
58{
59 memset(d, 0, sizeof(*d));
60
61 spin_lock_bh(lock);
62 d->lock = lock;
63 if (type)
64 d->tail = (struct rtattr *) skb->tail;
65 d->skb = skb;
66 d->compat_tc_stats = tc_stats_type;
67 d->compat_xstats = xstats_type;
68
69 if (d->tail)
70 return gnet_stats_copy(d, type, NULL, 0);
71
72 return 0;
73}
74
75/**
76 * gnet_stats_start_copy - start dumping procedure
77 * @skb: socket buffer to put statistics TLVs into
78 * @type: TLV type for top level statistic TLV
79 * @lock: statistics lock
80 * @d: dumping handle
81 *
82 * Initializes the dumping handle, grabs the statistic lock and appends
83 * an empty TLV header to the socket buffer for use as a container for all
84 * other statistic TLVs.
85 *
86 * Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
87 */
88int
89gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
90 struct gnet_dump *d)
91{
92 return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
93}
94
95/**
96 * gnet_stats_copy_basic - copy basic statistics into statistic TLV
97 * @d: dumping handle
98 * @b: basic statistics
99 *
100 * Appends the basic statistics to the top level TLV created by
101 * gnet_stats_start_copy().
102 *
103 * Returns 0 on success or -1 with the statistic lock released
104 * if the room in the socket buffer was not sufficient.
105 */
106int
107gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b)
108{
109 if (d->compat_tc_stats) {
110 d->tc_stats.bytes = b->bytes;
111 d->tc_stats.packets = b->packets;
112 }
113
114 if (d->tail)
115 return gnet_stats_copy(d, TCA_STATS_BASIC, b, sizeof(*b));
116
117 return 0;
118}
119
120/**
121 * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
122 * @d: dumping handle
123 * @r: rate estimator statistics
124 *
125 * Appends the rate estimator statistics to the top level TLV created by
126 * gnet_stats_start_copy().
127 *
128 * Returns 0 on success or -1 with the statistic lock released
129 * if the room in the socket buffer was not sufficient.
130 */
131int
132gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r)
133{
134 if (d->compat_tc_stats) {
135 d->tc_stats.bps = r->bps;
136 d->tc_stats.pps = r->pps;
137 }
138
139 if (d->tail)
140 return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r));
141
142 return 0;
143}
144
145/**
146 * gnet_stats_copy_queue - copy queue statistics into statistics TLV
147 * @d: dumping handle
148 * @q: queue statistics
149 *
150 * Appends the queue statistics to the top level TLV created by
151 * gnet_stats_start_copy().
152 *
153 * Returns 0 on success or -1 with the statistic lock released
154 * if the room in the socket buffer was not sufficient.
155 */
156int
157gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q)
158{
159 if (d->compat_tc_stats) {
160 d->tc_stats.drops = q->drops;
161 d->tc_stats.qlen = q->qlen;
162 d->tc_stats.backlog = q->backlog;
163 d->tc_stats.overlimits = q->overlimits;
164 }
165
166 if (d->tail)
167 return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q));
168
169 return 0;
170}
171
172/**
173 * gnet_stats_copy_app - copy application specific statistics into statistics TLV
174 * @d: dumping handle
175 * @st: application specific statistics data
176 * @len: length of data
177 *
178 * Appends the application specific statistics to the top level TLV created by
179 * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
180 * handle is in backward compatibility mode.
181 *
182 * Returns 0 on success or -1 with the statistic lock released
183 * if the room in the socket buffer was not sufficient.
184 */
185int
186gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
187{
188 if (d->compat_xstats) {
189 d->xstats = st;
190 d->xstats_len = len;
191 }
192
193 if (d->tail)
194 return gnet_stats_copy(d, TCA_STATS_APP, st, len);
195
196 return 0;
197}
198
199/**
200 * gnet_stats_finish_copy - finish dumping procedure
201 * @d: dumping handle
202 *
203 * Corrects the length of the top level TLV to include all TLVs added
204 * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs
205 * if gnet_stats_start_copy_compat() was used and releases the statistics
206 * lock.
207 *
208 * Returns 0 on success or -1 with the statistic lock released
209 * if the room in the socket buffer was not sufficient.
210 */
211int
212gnet_stats_finish_copy(struct gnet_dump *d)
213{
214 if (d->tail)
215 d->tail->rta_len = d->skb->tail - (u8 *) d->tail;
216
217 if (d->compat_tc_stats)
218 if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
219 sizeof(d->tc_stats)) < 0)
220 return -1;
221
222 if (d->compat_xstats && d->xstats) {
223 if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
224 d->xstats_len) < 0)
225 return -1;
226 }
227
228 spin_unlock_bh(d->lock);
229 return 0;
230}
231
232
233EXPORT_SYMBOL(gnet_stats_start_copy);
234EXPORT_SYMBOL(gnet_stats_start_copy_compat);
235EXPORT_SYMBOL(gnet_stats_copy_basic);
236EXPORT_SYMBOL(gnet_stats_copy_rate_est);
237EXPORT_SYMBOL(gnet_stats_copy_queue);
238EXPORT_SYMBOL(gnet_stats_copy_app);
239EXPORT_SYMBOL(gnet_stats_finish_copy);
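The dump helpers above append an empty container TLV first and let gnet_stats_finish_copy() backpatch its length once all nested TLVs are in place. A stand-alone user-space sketch of that reserve-then-backpatch pattern (simplified rtattr-like header, no padding or error handling, type values hypothetical):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct tlv { uint16_t len; uint16_t type; };	/* simplified rtattr-like header */

static union { unsigned char raw[256]; uint32_t align; } buf;
static size_t used;

static struct tlv *put_tlv(uint16_t type, const void *data, size_t len)
{
	struct tlv *t = (struct tlv *)(buf.raw + used);

	t->type = type;
	t->len = (uint16_t)(sizeof(*t) + len);
	if (data)
		memcpy(t + 1, data, len);
	used += t->len;
	return t;
}

int main(void)
{
	uint64_t bytes = 123456;
	uint32_t packets = 789;
	struct tlv *container;

	container = put_tlv(1, NULL, 0);		/* reserve empty container TLV */
	put_tlv(2, &bytes, sizeof(bytes));		/* nested "basic" stats */
	put_tlv(3, &packets, sizeof(packets));		/* nested "queue" stats */

	/* finish: fix the container length to cover everything appended after it */
	container->len = (uint16_t)((buf.raw + used) - (unsigned char *)container);

	printf("container type %u now covers %u bytes\n",
	       container->type, container->len);
	return 0;
}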
diff --git a/net/core/iovec.c b/net/core/iovec.c
new file mode 100644
index 000000000000..d57ace949ab8
--- /dev/null
+++ b/net/core/iovec.c
@@ -0,0 +1,239 @@
1/*
2 * iovec manipulation routines.
3 *
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 *
10 * Fixes:
11 * Andrew Lunn : Errors in iovec copying.
12 * Pedro Roque : Added memcpy_fromiovecend and
13 * csum_..._fromiovecend.
14 * Andi Kleen : fixed error handling for 2.1
15 * Alexey Kuznetsov: 2.1 optimisations
16 * Andi Kleen : Fix csum*fromiovecend for IPv6.
17 */
18
19#include <linux/errno.h>
20#include <linux/module.h>
21#include <linux/sched.h>
22#include <linux/kernel.h>
23#include <linux/mm.h>
24#include <linux/slab.h>
25#include <linux/net.h>
26#include <linux/in6.h>
27#include <asm/uaccess.h>
28#include <asm/byteorder.h>
29#include <net/checksum.h>
30#include <net/sock.h>
31
32/*
33 * Verify iovec. The caller must ensure that the iovec is big enough
34 * to hold the message iovec.
35 *
36 * Save time not doing verify_area. copy_*_user will make this work
37 * in any case.
38 */
39
40int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode)
41{
42 int size, err, ct;
43
44 if (m->msg_namelen) {
45 if (mode == VERIFY_READ) {
46 err = move_addr_to_kernel(m->msg_name, m->msg_namelen,
47 address);
48 if (err < 0)
49 return err;
50 }
51 m->msg_name = address;
52 } else {
53 m->msg_name = NULL;
54 }
55
56 size = m->msg_iovlen * sizeof(struct iovec);
57 if (copy_from_user(iov, m->msg_iov, size))
58 return -EFAULT;
59
60 m->msg_iov = iov;
61 err = 0;
62
63 for (ct = 0; ct < m->msg_iovlen; ct++) {
64 err += iov[ct].iov_len;
65 /*
66 * Goal is not to verify user data, but to prevent returning
67 * negative value, which is interpreted as errno.
68 * Overflow is still possible, but it is harmless.
69 */
70 if (err < 0)
71 return -EMSGSIZE;
72 }
73
74 return err;
75}
76
77/*
78 * Copy kernel to iovec. Returns -EFAULT on error.
79 *
80 * Note: this modifies the original iovec.
81 */
82
83int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
84{
85 while (len > 0) {
86 if (iov->iov_len) {
87 int copy = min_t(unsigned int, iov->iov_len, len);
88 if (copy_to_user(iov->iov_base, kdata, copy))
89 return -EFAULT;
90 kdata += copy;
91 len -= copy;
92 iov->iov_len -= copy;
93 iov->iov_base += copy;
94 }
95 iov++;
96 }
97
98 return 0;
99}
100
101/*
102 * Copy iovec to kernel. Returns -EFAULT on error.
103 *
104 * Note: this modifies the original iovec.
105 */
106
107int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
108{
109 while (len > 0) {
110 if (iov->iov_len) {
111 int copy = min_t(unsigned int, len, iov->iov_len);
112 if (copy_from_user(kdata, iov->iov_base, copy))
113 return -EFAULT;
114 len -= copy;
115 kdata += copy;
116 iov->iov_base += copy;
117 iov->iov_len -= copy;
118 }
119 iov++;
120 }
121
122 return 0;
123}
124
125/*
126 * For use with ip_build_xmit
127 */
128int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset,
129 int len)
130{
131 /* Skip over the finished iovecs */
132 while (offset >= iov->iov_len) {
133 offset -= iov->iov_len;
134 iov++;
135 }
136
137 while (len > 0) {
138 u8 __user *base = iov->iov_base + offset;
139 int copy = min_t(unsigned int, len, iov->iov_len - offset);
140
141 offset = 0;
142 if (copy_from_user(kdata, base, copy))
143 return -EFAULT;
144 len -= copy;
145 kdata += copy;
146 iov++;
147 }
148
149 return 0;
150}
151
152/*
153 * And now for the all-in-one: copy and checksum from a user iovec
154 * directly to a datagram
155 * Calls to csum_partial but the last must be in 32 bit chunks
156 *
157 * ip_build_xmit must ensure that when fragmenting only the last
158 * call to this function will be unaligned also.
159 */
160int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
161 int offset, unsigned int len, int *csump)
162{
163 int csum = *csump;
164 int partial_cnt = 0, err = 0;
165
166 /* Skip over the finished iovecs */
167 while (offset >= iov->iov_len) {
168 offset -= iov->iov_len;
169 iov++;
170 }
171
172 while (len > 0) {
173 u8 __user *base = iov->iov_base + offset;
174 int copy = min_t(unsigned int, len, iov->iov_len - offset);
175
176 offset = 0;
177
178 /* There is a remnant from previous iov. */
179 if (partial_cnt) {
180 int par_len = 4 - partial_cnt;
181
182 /* iov component is too short ... */
183 if (par_len > copy) {
184 if (copy_from_user(kdata, base, copy))
185 goto out_fault;
186 kdata += copy;
187 base += copy;
188 partial_cnt += copy;
189 len -= copy;
190 iov++;
191 if (len)
192 continue;
193 *csump = csum_partial(kdata - partial_cnt,
194 partial_cnt, csum);
195 goto out;
196 }
197 if (copy_from_user(kdata, base, par_len))
198 goto out_fault;
199 csum = csum_partial(kdata - partial_cnt, 4, csum);
200 kdata += par_len;
201 base += par_len;
202 copy -= par_len;
203 len -= par_len;
204 partial_cnt = 0;
205 }
206
207 if (len > copy) {
208 partial_cnt = copy % 4;
209 if (partial_cnt) {
210 copy -= partial_cnt;
211 if (copy_from_user(kdata + copy, base + copy,
212 partial_cnt))
213 goto out_fault;
214 }
215 }
216
217 if (copy) {
218 csum = csum_and_copy_from_user(base, kdata, copy,
219 csum, &err);
220 if (err)
221 goto out;
222 }
223 len -= copy + partial_cnt;
224 kdata += copy + partial_cnt;
225 iov++;
226 }
227 *csump = csum;
228out:
229 return err;
230
231out_fault:
232 err = -EFAULT;
233 goto out;
234}
235
236EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
237EXPORT_SYMBOL(memcpy_fromiovec);
238EXPORT_SYMBOL(memcpy_fromiovecend);
239EXPORT_SYMBOL(memcpy_toiovec);
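memcpy_toiovec() above walks a flat kernel buffer into a scattered iovec, consuming each segment as it fills. A stand-alone user-space analogue using plain memcpy() instead of copy_to_user(), with the same assumption that the caller provides enough iovec space:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

static int copy_to_segments(struct iovec *iov, const char *src, size_t len)
{
	while (len > 0) {
		if (iov->iov_len) {
			size_t copy = iov->iov_len < len ? iov->iov_len : len;

			memcpy(iov->iov_base, src, copy);
			src += copy;
			len -= copy;
			/* consume the segment, as the kernel helper does */
			iov->iov_base = (char *)iov->iov_base + copy;
			iov->iov_len -= copy;
		}
		iov++;
	}
	return 0;
}

int main(void)
{
	char a[4] = "", b[8] = "";
	struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

	copy_to_segments(iov, "hello world", 11);
	printf("%.4s|%.7s\n", a, b);	/* prints "hell|o world" */
	return 0;
}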
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
new file mode 100644
index 000000000000..4859b7446c6f
--- /dev/null
+++ b/net/core/link_watch.c
@@ -0,0 +1,137 @@
1/*
2 * Linux network device link state notification
3 *
4 * Author:
5 * Stefan Rompf <sux@loplof.de>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/netdevice.h>
17#include <linux/if.h>
18#include <net/sock.h>
19#include <linux/rtnetlink.h>
20#include <linux/jiffies.h>
21#include <linux/spinlock.h>
22#include <linux/list.h>
23#include <linux/slab.h>
24#include <linux/workqueue.h>
25#include <linux/bitops.h>
26#include <asm/types.h>
27
28
29enum lw_bits {
30 LW_RUNNING = 0,
31 LW_SE_USED
32};
33
34static unsigned long linkwatch_flags;
35static unsigned long linkwatch_nextevent;
36
37static void linkwatch_event(void *dummy);
38static DECLARE_WORK(linkwatch_work, linkwatch_event, NULL);
39
40static LIST_HEAD(lweventlist);
41static DEFINE_SPINLOCK(lweventlist_lock);
42
43struct lw_event {
44 struct list_head list;
45 struct net_device *dev;
46};
47
48/* Avoid kmalloc() for most systems */
49static struct lw_event singleevent;
50
51/* Must be called with the rtnl semaphore held */
52void linkwatch_run_queue(void)
53{
54 LIST_HEAD(head);
55 struct list_head *n, *next;
56
57 spin_lock_irq(&lweventlist_lock);
58 list_splice_init(&lweventlist, &head);
59 spin_unlock_irq(&lweventlist_lock);
60
61 list_for_each_safe(n, next, &head) {
62 struct lw_event *event = list_entry(n, struct lw_event, list);
63 struct net_device *dev = event->dev;
64
65 if (event == &singleevent) {
66 clear_bit(LW_SE_USED, &linkwatch_flags);
67 } else {
68 kfree(event);
69 }
70
71 /* We are about to handle this device,
72 * so new events can be accepted
73 */
74 clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
75
76 if (dev->flags & IFF_UP) {
77 netdev_state_change(dev);
78 }
79
80 dev_put(dev);
81 }
82}
83
84
85static void linkwatch_event(void *dummy)
86{
87 /* Limit the number of linkwatch events to one
88 * per second so that a runaway driver does not
89 * cause a storm of messages on the netlink
90 * socket
91 */
92 linkwatch_nextevent = jiffies + HZ;
93 clear_bit(LW_RUNNING, &linkwatch_flags);
94
95 rtnl_shlock();
96 linkwatch_run_queue();
97 rtnl_shunlock();
98}
99
100
101void linkwatch_fire_event(struct net_device *dev)
102{
103 if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
104 unsigned long flags;
105 struct lw_event *event;
106
107 if (test_and_set_bit(LW_SE_USED, &linkwatch_flags)) {
108 event = kmalloc(sizeof(struct lw_event), GFP_ATOMIC);
109
110 if (unlikely(event == NULL)) {
111 clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
112 return;
113 }
114 } else {
115 event = &singleevent;
116 }
117
118 dev_hold(dev);
119 event->dev = dev;
120
121 spin_lock_irqsave(&lweventlist_lock, flags);
122 list_add_tail(&event->list, &lweventlist);
123 spin_unlock_irqrestore(&lweventlist_lock, flags);
124
125 if (!test_and_set_bit(LW_RUNNING, &linkwatch_flags)) {
126 unsigned long thisevent = jiffies;
127
128 if (thisevent >= linkwatch_nextevent) {
129 schedule_work(&linkwatch_work);
130 } else {
131 schedule_delayed_work(&linkwatch_work, linkwatch_nextevent - thisevent);
132 }
133 }
134 }
135}
136
137EXPORT_SYMBOL(linkwatch_fire_event);
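linkwatch_fire_event() above coalesces per-device events through the __LINK_STATE_LINKWATCH_PENDING bit and defers queue runs so they happen at most once per second. A stand-alone user-space sketch of those two ideas (hypothetical names, single-threaded, no work queue):

#include <stdio.h>
#include <time.h>

struct object {
	int pending;		/* like the per-device LINKWATCH_PENDING bit */
	const char *name;
};

static time_t nextevent;	/* earliest time the queue may be drained again */

static void fire_event(struct object *obj)
{
	if (obj->pending)
		return;				/* already queued: coalesce */
	obj->pending = 1;
	printf("queued %s\n", obj->name);	/* stand-in for list_add_tail() */
}

static void run_queue(struct object *objs, int n)
{
	time_t now = time(NULL);

	if (now < nextevent)
		return;				/* rate limit: at most once per second */
	nextevent = now + 1;
	for (int i = 0; i < n; i++) {
		if (objs[i].pending) {
			objs[i].pending = 0;	/* new events accepted again */
			printf("notified %s\n", objs[i].name);
		}
	}
}

int main(void)
{
	struct object eth0 = { 0, "eth0" };

	fire_event(&eth0);
	fire_event(&eth0);	/* coalesced: no second queue entry */
	run_queue(&eth0, 1);
	return 0;
}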
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
new file mode 100644
index 000000000000..0a2f67bbef2e
--- /dev/null
+++ b/net/core/neighbour.c
@@ -0,0 +1,2362 @@
1/*
2 * Generic address resolution entity
3 *
4 * Authors:
5 * Pedro Roque <roque@di.fc.ul.pt>
6 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Fixes:
14 * Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
15 * Harald Welte Add neighbour cache statistics like rtstat
16 */
17
18#include <linux/config.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/module.h>
22#include <linux/socket.h>
23#include <linux/sched.h>
24#include <linux/netdevice.h>
25#include <linux/proc_fs.h>
26#ifdef CONFIG_SYSCTL
27#include <linux/sysctl.h>
28#endif
29#include <linux/times.h>
30#include <net/neighbour.h>
31#include <net/dst.h>
32#include <net/sock.h>
33#include <linux/rtnetlink.h>
34#include <linux/random.h>
35
36#define NEIGH_DEBUG 1
37
38#define NEIGH_PRINTK(x...) printk(x)
39#define NEIGH_NOPRINTK(x...) do { ; } while(0)
40#define NEIGH_PRINTK0 NEIGH_PRINTK
41#define NEIGH_PRINTK1 NEIGH_NOPRINTK
42#define NEIGH_PRINTK2 NEIGH_NOPRINTK
43
44#if NEIGH_DEBUG >= 1
45#undef NEIGH_PRINTK1
46#define NEIGH_PRINTK1 NEIGH_PRINTK
47#endif
48#if NEIGH_DEBUG >= 2
49#undef NEIGH_PRINTK2
50#define NEIGH_PRINTK2 NEIGH_PRINTK
51#endif
52
53#define PNEIGH_HASHMASK 0xF
54
55static void neigh_timer_handler(unsigned long arg);
56#ifdef CONFIG_ARPD
57static void neigh_app_notify(struct neighbour *n);
58#endif
59static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
60void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
61
62static struct neigh_table *neigh_tables;
63static struct file_operations neigh_stat_seq_fops;
64
65/*
66 Neighbour hash table buckets are protected with rwlock tbl->lock.
67
68 - All the scans/updates to hash buckets MUST be made under this lock.
69 - NOTHING clever should be made under this lock: no callbacks
70 to protocol backends, no attempts to send something to network.
71 It will result in deadlocks, if backend/driver wants to use neighbour
72 cache.
73 - If the entry requires some non-trivial actions, increase
74 its reference count and release table lock.
75
76 Neighbour entries are protected:
77 - with reference count.
78 - with rwlock neigh->lock
79
80 Reference count prevents destruction.
81
82 neigh->lock mainly serializes ll address data and its validity state.
83   However, the same lock is used to protect other entry fields:
84 - timer
85 - resolution queue
86
87   Again, nothing clever shall be done under neigh->lock;
88   the most complicated procedure we allow is dev->hard_header.
89   It is assumed that dev->hard_header is simple and does
90   not make callbacks to neighbour tables.
91
92   The last lock is neigh_tbl_lock. It is a pure SMP lock, protecting
93   the list of neighbour tables. This list is used only in process context.
94 */
95
96static DEFINE_RWLOCK(neigh_tbl_lock);
97
98static int neigh_blackhole(struct sk_buff *skb)
99{
100 kfree_skb(skb);
101 return -ENETDOWN;
102}
103
104/*
105 * It is a random distribution in the interval (1/2)*base...(3/2)*base.
106 * It corresponds to the default IPv6 settings and is not overridable,
107 * because it is a really reasonable choice.
108 */
109
110unsigned long neigh_rand_reach_time(unsigned long base)
111{
112 return (base ? (net_random() % base) + (base >> 1) : 0);
113}
114
115
116static int neigh_forced_gc(struct neigh_table *tbl)
117{
118 int shrunk = 0;
119 int i;
120
121 NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
122
123 write_lock_bh(&tbl->lock);
124 for (i = 0; i <= tbl->hash_mask; i++) {
125 struct neighbour *n, **np;
126
127 np = &tbl->hash_buckets[i];
128 while ((n = *np) != NULL) {
129 /* Neighbour record may be discarded if:
130 * - nobody refers to it.
131 * - it is not permanent
132 */
133 write_lock(&n->lock);
134 if (atomic_read(&n->refcnt) == 1 &&
135 !(n->nud_state & NUD_PERMANENT)) {
136 *np = n->next;
137 n->dead = 1;
138 shrunk = 1;
139 write_unlock(&n->lock);
140 neigh_release(n);
141 continue;
142 }
143 write_unlock(&n->lock);
144 np = &n->next;
145 }
146 }
147
148 tbl->last_flush = jiffies;
149
150 write_unlock_bh(&tbl->lock);
151
152 return shrunk;
153}
154
155static int neigh_del_timer(struct neighbour *n)
156{
157 if ((n->nud_state & NUD_IN_TIMER) &&
158 del_timer(&n->timer)) {
159 neigh_release(n);
160 return 1;
161 }
162 return 0;
163}
164
165static void pneigh_queue_purge(struct sk_buff_head *list)
166{
167 struct sk_buff *skb;
168
169 while ((skb = skb_dequeue(list)) != NULL) {
170 dev_put(skb->dev);
171 kfree_skb(skb);
172 }
173}
174
175void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
176{
177 int i;
178
179 write_lock_bh(&tbl->lock);
180
181 for (i=0; i <= tbl->hash_mask; i++) {
182 struct neighbour *n, **np;
183
184 np = &tbl->hash_buckets[i];
185 while ((n = *np) != NULL) {
186 if (dev && n->dev != dev) {
187 np = &n->next;
188 continue;
189 }
190 *np = n->next;
191 write_lock_bh(&n->lock);
192 n->dead = 1;
193 neigh_del_timer(n);
194 write_unlock_bh(&n->lock);
195 neigh_release(n);
196 }
197 }
198
199 write_unlock_bh(&tbl->lock);
200}
201
202int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
203{
204 int i;
205
206 write_lock_bh(&tbl->lock);
207
208 for (i = 0; i <= tbl->hash_mask; i++) {
209 struct neighbour *n, **np = &tbl->hash_buckets[i];
210
211 while ((n = *np) != NULL) {
212 if (dev && n->dev != dev) {
213 np = &n->next;
214 continue;
215 }
216 *np = n->next;
217 write_lock(&n->lock);
218 neigh_del_timer(n);
219 n->dead = 1;
220
221 if (atomic_read(&n->refcnt) != 1) {
222 /* The most unpleasant situation.
223                           We must destroy the neighbour entry,
224                           but someone is still using it.
225
226                           The destruction will be delayed until
227                           the last user releases us, but
228                           we must kill timers etc. and move
229                           it to a safe state.
230 */
231 skb_queue_purge(&n->arp_queue);
232 n->output = neigh_blackhole;
233 if (n->nud_state & NUD_VALID)
234 n->nud_state = NUD_NOARP;
235 else
236 n->nud_state = NUD_NONE;
237 NEIGH_PRINTK2("neigh %p is stray.\n", n);
238 }
239 write_unlock(&n->lock);
240 neigh_release(n);
241 }
242 }
243
244 pneigh_ifdown(tbl, dev);
245 write_unlock_bh(&tbl->lock);
246
247 del_timer_sync(&tbl->proxy_timer);
248 pneigh_queue_purge(&tbl->proxy_queue);
249 return 0;
250}
251
252static struct neighbour *neigh_alloc(struct neigh_table *tbl)
253{
254 struct neighbour *n = NULL;
255 unsigned long now = jiffies;
256 int entries;
257
258 entries = atomic_inc_return(&tbl->entries) - 1;
259 if (entries >= tbl->gc_thresh3 ||
260 (entries >= tbl->gc_thresh2 &&
261 time_after(now, tbl->last_flush + 5 * HZ))) {
262 if (!neigh_forced_gc(tbl) &&
263 entries >= tbl->gc_thresh3)
264 goto out_entries;
265 }
266
267 n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC);
268 if (!n)
269 goto out_entries;
270
271 memset(n, 0, tbl->entry_size);
272
273 skb_queue_head_init(&n->arp_queue);
274 rwlock_init(&n->lock);
275 n->updated = n->used = now;
276 n->nud_state = NUD_NONE;
277 n->output = neigh_blackhole;
278 n->parms = neigh_parms_clone(&tbl->parms);
279 init_timer(&n->timer);
280 n->timer.function = neigh_timer_handler;
281 n->timer.data = (unsigned long)n;
282
283 NEIGH_CACHE_STAT_INC(tbl, allocs);
284 n->tbl = tbl;
285 atomic_set(&n->refcnt, 1);
286 n->dead = 1;
287out:
288 return n;
289
290out_entries:
291 atomic_dec(&tbl->entries);
292 goto out;
293}
294
295static struct neighbour **neigh_hash_alloc(unsigned int entries)
296{
297 unsigned long size = entries * sizeof(struct neighbour *);
298 struct neighbour **ret;
299
300 if (size <= PAGE_SIZE) {
301 ret = kmalloc(size, GFP_ATOMIC);
302 } else {
303 ret = (struct neighbour **)
304 __get_free_pages(GFP_ATOMIC, get_order(size));
305 }
306 if (ret)
307 memset(ret, 0, size);
308
309 return ret;
310}
311
312static void neigh_hash_free(struct neighbour **hash, unsigned int entries)
313{
314 unsigned long size = entries * sizeof(struct neighbour *);
315
316 if (size <= PAGE_SIZE)
317 kfree(hash);
318 else
319 free_pages((unsigned long)hash, get_order(size));
320}
321
322static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
323{
324 struct neighbour **new_hash, **old_hash;
325 unsigned int i, new_hash_mask, old_entries;
326
327 NEIGH_CACHE_STAT_INC(tbl, hash_grows);
328
329 BUG_ON(new_entries & (new_entries - 1));
330 new_hash = neigh_hash_alloc(new_entries);
331 if (!new_hash)
332 return;
333
334 old_entries = tbl->hash_mask + 1;
335 new_hash_mask = new_entries - 1;
336 old_hash = tbl->hash_buckets;
337
338 get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
339 for (i = 0; i < old_entries; i++) {
340 struct neighbour *n, *next;
341
342 for (n = old_hash[i]; n; n = next) {
343 unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
344
345 hash_val &= new_hash_mask;
346 next = n->next;
347
348 n->next = new_hash[hash_val];
349 new_hash[hash_val] = n;
350 }
351 }
352 tbl->hash_buckets = new_hash;
353 tbl->hash_mask = new_hash_mask;
354
355 neigh_hash_free(old_hash, old_entries);
356}
357
358struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
359 struct net_device *dev)
360{
361 struct neighbour *n;
362 int key_len = tbl->key_len;
363 u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
364
365 NEIGH_CACHE_STAT_INC(tbl, lookups);
366
367 read_lock_bh(&tbl->lock);
368 for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
369 if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
370 neigh_hold(n);
371 NEIGH_CACHE_STAT_INC(tbl, hits);
372 break;
373 }
374 }
375 read_unlock_bh(&tbl->lock);
376 return n;
377}
378
379struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey)
380{
381 struct neighbour *n;
382 int key_len = tbl->key_len;
383 u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask;
384
385 NEIGH_CACHE_STAT_INC(tbl, lookups);
386
387 read_lock_bh(&tbl->lock);
388 for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
389 if (!memcmp(n->primary_key, pkey, key_len)) {
390 neigh_hold(n);
391 NEIGH_CACHE_STAT_INC(tbl, hits);
392 break;
393 }
394 }
395 read_unlock_bh(&tbl->lock);
396 return n;
397}
398
399struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
400 struct net_device *dev)
401{
402 u32 hash_val;
403 int key_len = tbl->key_len;
404 int error;
405 struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
406
407 if (!n) {
408 rc = ERR_PTR(-ENOBUFS);
409 goto out;
410 }
411
412 memcpy(n->primary_key, pkey, key_len);
413 n->dev = dev;
414 dev_hold(dev);
415
416 /* Protocol specific setup. */
417 if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
418 rc = ERR_PTR(error);
419 goto out_neigh_release;
420 }
421
422 /* Device specific setup. */
423 if (n->parms->neigh_setup &&
424 (error = n->parms->neigh_setup(n)) < 0) {
425 rc = ERR_PTR(error);
426 goto out_neigh_release;
427 }
428
429 n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
430
431 write_lock_bh(&tbl->lock);
432
433 if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
434 neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
435
436 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
437
438 if (n->parms->dead) {
439 rc = ERR_PTR(-EINVAL);
440 goto out_tbl_unlock;
441 }
442
443 for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
444 if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
445 neigh_hold(n1);
446 rc = n1;
447 goto out_tbl_unlock;
448 }
449 }
450
451 n->next = tbl->hash_buckets[hash_val];
452 tbl->hash_buckets[hash_val] = n;
453 n->dead = 0;
454 neigh_hold(n);
455 write_unlock_bh(&tbl->lock);
456 NEIGH_PRINTK2("neigh %p is created.\n", n);
457 rc = n;
458out:
459 return rc;
460out_tbl_unlock:
461 write_unlock_bh(&tbl->lock);
462out_neigh_release:
463 neigh_release(n);
464 goto out;
465}
466
467struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
468 struct net_device *dev, int creat)
469{
470 struct pneigh_entry *n;
471 int key_len = tbl->key_len;
472 u32 hash_val = *(u32 *)(pkey + key_len - 4);
473
474 hash_val ^= (hash_val >> 16);
475 hash_val ^= hash_val >> 8;
476 hash_val ^= hash_val >> 4;
477 hash_val &= PNEIGH_HASHMASK;
478
479 read_lock_bh(&tbl->lock);
480
481 for (n = tbl->phash_buckets[hash_val]; n; n = n->next) {
482 if (!memcmp(n->key, pkey, key_len) &&
483 (n->dev == dev || !n->dev)) {
484 read_unlock_bh(&tbl->lock);
485 goto out;
486 }
487 }
488 read_unlock_bh(&tbl->lock);
489 n = NULL;
490 if (!creat)
491 goto out;
492
493 n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
494 if (!n)
495 goto out;
496
497 memcpy(n->key, pkey, key_len);
498 n->dev = dev;
499 if (dev)
500 dev_hold(dev);
501
502 if (tbl->pconstructor && tbl->pconstructor(n)) {
503 if (dev)
504 dev_put(dev);
505 kfree(n);
506 n = NULL;
507 goto out;
508 }
509
510 write_lock_bh(&tbl->lock);
511 n->next = tbl->phash_buckets[hash_val];
512 tbl->phash_buckets[hash_val] = n;
513 write_unlock_bh(&tbl->lock);
514out:
515 return n;
516}
517
518
519int pneigh_delete(struct neigh_table *tbl, const void *pkey,
520 struct net_device *dev)
521{
522 struct pneigh_entry *n, **np;
523 int key_len = tbl->key_len;
524 u32 hash_val = *(u32 *)(pkey + key_len - 4);
525
526 hash_val ^= (hash_val >> 16);
527 hash_val ^= hash_val >> 8;
528 hash_val ^= hash_val >> 4;
529 hash_val &= PNEIGH_HASHMASK;
530
531 write_lock_bh(&tbl->lock);
532 for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
533 np = &n->next) {
534 if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
535 *np = n->next;
536 write_unlock_bh(&tbl->lock);
537 if (tbl->pdestructor)
538 tbl->pdestructor(n);
539 if (n->dev)
540 dev_put(n->dev);
541 kfree(n);
542 return 0;
543 }
544 }
545 write_unlock_bh(&tbl->lock);
546 return -ENOENT;
547}
548
549static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
550{
551 struct pneigh_entry *n, **np;
552 u32 h;
553
554 for (h = 0; h <= PNEIGH_HASHMASK; h++) {
555 np = &tbl->phash_buckets[h];
556 while ((n = *np) != NULL) {
557 if (!dev || n->dev == dev) {
558 *np = n->next;
559 if (tbl->pdestructor)
560 tbl->pdestructor(n);
561 if (n->dev)
562 dev_put(n->dev);
563 kfree(n);
564 continue;
565 }
566 np = &n->next;
567 }
568 }
569 return -ENOENT;
570}
571
572
573/*
574 * neighbour must already be out of the table;
575 *
576 */
577void neigh_destroy(struct neighbour *neigh)
578{
579 struct hh_cache *hh;
580
581 NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
582
583 if (!neigh->dead) {
584 printk(KERN_WARNING
585 "Destroying alive neighbour %p\n", neigh);
586 dump_stack();
587 return;
588 }
589
590 if (neigh_del_timer(neigh))
591 printk(KERN_WARNING "Impossible event.\n");
592
593 while ((hh = neigh->hh) != NULL) {
594 neigh->hh = hh->hh_next;
595 hh->hh_next = NULL;
596 write_lock_bh(&hh->hh_lock);
597 hh->hh_output = neigh_blackhole;
598 write_unlock_bh(&hh->hh_lock);
599 if (atomic_dec_and_test(&hh->hh_refcnt))
600 kfree(hh);
601 }
602
603 if (neigh->ops && neigh->ops->destructor)
604 (neigh->ops->destructor)(neigh);
605
606 skb_queue_purge(&neigh->arp_queue);
607
608 dev_put(neigh->dev);
609 neigh_parms_put(neigh->parms);
610
611 NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
612
613 atomic_dec(&neigh->tbl->entries);
614 kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
615}
616
617/* Neighbour state is suspicious;
618 disable fast path.
619
620 Called with write_locked neigh.
621 */
622static void neigh_suspect(struct neighbour *neigh)
623{
624 struct hh_cache *hh;
625
626 NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
627
628 neigh->output = neigh->ops->output;
629
630 for (hh = neigh->hh; hh; hh = hh->hh_next)
631 hh->hh_output = neigh->ops->output;
632}
633
634/* Neighbour state is OK;
635 enable fast path.
636
637 Called with write_locked neigh.
638 */
639static void neigh_connect(struct neighbour *neigh)
640{
641 struct hh_cache *hh;
642
643 NEIGH_PRINTK2("neigh %p is connected.\n", neigh);
644
645 neigh->output = neigh->ops->connected_output;
646
647 for (hh = neigh->hh; hh; hh = hh->hh_next)
648 hh->hh_output = neigh->ops->hh_output;
649}
650
651static void neigh_periodic_timer(unsigned long arg)
652{
653 struct neigh_table *tbl = (struct neigh_table *)arg;
654 struct neighbour *n, **np;
655 unsigned long expire, now = jiffies;
656
657 NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
658
659 write_lock(&tbl->lock);
660
661 /*
662 * periodically recompute ReachableTime from random function
663 */
664
665 if (time_after(now, tbl->last_rand + 300 * HZ)) {
666 struct neigh_parms *p;
667 tbl->last_rand = now;
668 for (p = &tbl->parms; p; p = p->next)
669 p->reachable_time =
670 neigh_rand_reach_time(p->base_reachable_time);
671 }
672
673 np = &tbl->hash_buckets[tbl->hash_chain_gc];
674 tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask);
675
676 while ((n = *np) != NULL) {
677 unsigned int state;
678
679 write_lock(&n->lock);
680
681 state = n->nud_state;
682 if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
683 write_unlock(&n->lock);
684 goto next_elt;
685 }
686
687 if (time_before(n->used, n->confirmed))
688 n->used = n->confirmed;
689
690 if (atomic_read(&n->refcnt) == 1 &&
691 (state == NUD_FAILED ||
692 time_after(now, n->used + n->parms->gc_staletime))) {
693 *np = n->next;
694 n->dead = 1;
695 write_unlock(&n->lock);
696 neigh_release(n);
697 continue;
698 }
699 write_unlock(&n->lock);
700
701next_elt:
702 np = &n->next;
703 }
704
705 /* Cycle through all hash buckets every base_reachable_time/2 ticks.
706 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
707 * base_reachable_time.
708 */
709 expire = tbl->parms.base_reachable_time >> 1;
710 expire /= (tbl->hash_mask + 1);
711 if (!expire)
712 expire = 1;
713
714 mod_timer(&tbl->gc_timer, now + expire);
715
716 write_unlock(&tbl->lock);
717}
718
719static __inline__ int neigh_max_probes(struct neighbour *n)
720{
721 struct neigh_parms *p = n->parms;
722 return (n->nud_state & NUD_PROBE ?
723 p->ucast_probes :
724 p->ucast_probes + p->app_probes + p->mcast_probes);
725}
726
727
728/* Called when a timer expires for a neighbour entry. */
729
730static void neigh_timer_handler(unsigned long arg)
731{
732 unsigned long now, next;
733 struct neighbour *neigh = (struct neighbour *)arg;
734 unsigned state;
735 int notify = 0;
736
737 write_lock(&neigh->lock);
738
739 state = neigh->nud_state;
740 now = jiffies;
741 next = now + HZ;
742
743 if (!(state & NUD_IN_TIMER)) {
744#ifndef CONFIG_SMP
745 printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
746#endif
747 goto out;
748 }
749
750 if (state & NUD_REACHABLE) {
751 if (time_before_eq(now,
752 neigh->confirmed + neigh->parms->reachable_time)) {
753 NEIGH_PRINTK2("neigh %p is still alive.\n", neigh);
754 next = neigh->confirmed + neigh->parms->reachable_time;
755 } else if (time_before_eq(now,
756 neigh->used + neigh->parms->delay_probe_time)) {
757 NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
758 neigh->nud_state = NUD_DELAY;
759 neigh_suspect(neigh);
760 next = now + neigh->parms->delay_probe_time;
761 } else {
762 NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
763 neigh->nud_state = NUD_STALE;
764 neigh_suspect(neigh);
765 }
766 } else if (state & NUD_DELAY) {
767 if (time_before_eq(now,
768 neigh->confirmed + neigh->parms->delay_probe_time)) {
769 NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh);
770 neigh->nud_state = NUD_REACHABLE;
771 neigh_connect(neigh);
772 next = neigh->confirmed + neigh->parms->reachable_time;
773 } else {
774 NEIGH_PRINTK2("neigh %p is probed.\n", neigh);
775 neigh->nud_state = NUD_PROBE;
776 atomic_set(&neigh->probes, 0);
777 next = now + neigh->parms->retrans_time;
778 }
779 } else {
780 /* NUD_PROBE|NUD_INCOMPLETE */
781 next = now + neigh->parms->retrans_time;
782 }
783
784 if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
785 atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
786 struct sk_buff *skb;
787
788 neigh->nud_state = NUD_FAILED;
789 notify = 1;
790 NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
791 NEIGH_PRINTK2("neigh %p is failed.\n", neigh);
792
793                /* This is a very delicate place. report_unreachable is a very complicated
794                   routine. In particular, it can hit the same neighbour entry!
795
796                   So we try to be careful and avoid a dead loop. --ANK
797 */
798 while (neigh->nud_state == NUD_FAILED &&
799 (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
800 write_unlock(&neigh->lock);
801 neigh->ops->error_report(neigh, skb);
802 write_lock(&neigh->lock);
803 }
804 skb_queue_purge(&neigh->arp_queue);
805 }
806
807 if (neigh->nud_state & NUD_IN_TIMER) {
808 neigh_hold(neigh);
809 if (time_before(next, jiffies + HZ/2))
810 next = jiffies + HZ/2;
811 neigh->timer.expires = next;
812 add_timer(&neigh->timer);
813 }
814 if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
815 struct sk_buff *skb = skb_peek(&neigh->arp_queue);
816 /* keep skb alive even if arp_queue overflows */
817 if (skb)
818 skb_get(skb);
819 write_unlock(&neigh->lock);
820 neigh->ops->solicit(neigh, skb);
821 atomic_inc(&neigh->probes);
822 if (skb)
823 kfree_skb(skb);
824 } else {
825out:
826 write_unlock(&neigh->lock);
827 }
828
829#ifdef CONFIG_ARPD
830 if (notify && neigh->parms->app_probes)
831 neigh_app_notify(neigh);
832#endif
833 neigh_release(neigh);
834}
835
836int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
837{
838 int rc;
839 unsigned long now;
840
841 write_lock_bh(&neigh->lock);
842
843 rc = 0;
844 if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
845 goto out_unlock_bh;
846
847 now = jiffies;
848
849 if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
850 if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
851 atomic_set(&neigh->probes, neigh->parms->ucast_probes);
852 neigh->nud_state = NUD_INCOMPLETE;
853 neigh_hold(neigh);
854 neigh->timer.expires = now + 1;
855 add_timer(&neigh->timer);
856 } else {
857 neigh->nud_state = NUD_FAILED;
858 write_unlock_bh(&neigh->lock);
859
860 if (skb)
861 kfree_skb(skb);
862 return 1;
863 }
864 } else if (neigh->nud_state & NUD_STALE) {
865 NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
866 neigh_hold(neigh);
867 neigh->nud_state = NUD_DELAY;
868 neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
869 add_timer(&neigh->timer);
870 }
871
872 if (neigh->nud_state == NUD_INCOMPLETE) {
873 if (skb) {
874 if (skb_queue_len(&neigh->arp_queue) >=
875 neigh->parms->queue_len) {
876 struct sk_buff *buff;
877 buff = neigh->arp_queue.next;
878 __skb_unlink(buff, &neigh->arp_queue);
879 kfree_skb(buff);
880 }
881 __skb_queue_tail(&neigh->arp_queue, skb);
882 }
883 rc = 1;
884 }
885out_unlock_bh:
886 write_unlock_bh(&neigh->lock);
887 return rc;
888}
889
890static __inline__ void neigh_update_hhs(struct neighbour *neigh)
891{
892 struct hh_cache *hh;
893 void (*update)(struct hh_cache*, struct net_device*, unsigned char *) =
894 neigh->dev->header_cache_update;
895
896 if (update) {
897 for (hh = neigh->hh; hh; hh = hh->hh_next) {
898 write_lock_bh(&hh->hh_lock);
899 update(hh, neigh->dev, neigh->ha);
900 write_unlock_bh(&hh->hh_lock);
901 }
902 }
903}
904
905
906
907/* Generic update routine.
908 -- lladdr is new lladdr or NULL, if it is not supplied.
909 -- new is new state.
910 -- flags
911        NEIGH_UPDATE_F_OVERRIDE allows the existing lladdr to be overridden,
912                                if it is different.
913        NEIGH_UPDATE_F_WEAK_OVERRIDE will mark an existing "connected"
914                                lladdr as suspect instead of overriding it
915                                if it is different.
916                                It also allows the current state to be retained
917                                if lladdr is unchanged.
918        NEIGH_UPDATE_F_ADMIN   means that the change is administrative.
919
920        NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows the existing
921                                NTF_ROUTER flag to be overridden.
922 NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
923 a router.
924
925 Caller MUST hold reference count on the entry.
926 */
927
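As an illustration of the override flags described above, the following stand-alone user-space sketch distils only the lladdr decision; it ignores states, timers and locking, so it is an approximation of the rules, not the real neigh_update() logic:

#include <stdio.h>
#include <string.h>

#define F_OVERRIDE	0x1
#define F_WEAK_OVERRIDE	0x2

/* Returns the lladdr the entry should end up with, or NULL to reject. */
static const char *decide_lladdr(const char *cached, const char *proposed,
				 unsigned flags)
{
	if (!proposed || !strcmp(cached, proposed))
		return cached;		/* nothing new: keep what we have */
	if (flags & F_OVERRIDE)
		return proposed;	/* caller may replace the address */
	if (flags & F_WEAK_OVERRIDE)
		return cached;		/* keep it; the entry would be marked suspect */
	return NULL;			/* conflicting address, no override: ignore */
}

int main(void)
{
	const char *r;

	printf("%s\n", decide_lladdr("aa:bb", "cc:dd", F_OVERRIDE));		/* cc:dd */
	printf("%s\n", decide_lladdr("aa:bb", "cc:dd", F_WEAK_OVERRIDE));	/* aa:bb */
	r = decide_lladdr("aa:bb", "cc:dd", 0);
	printf("%s\n", r ? r : "(ignored)");
	return 0;
}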
928int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
929 u32 flags)
930{
931 u8 old;
932 int err;
933#ifdef CONFIG_ARPD
934 int notify = 0;
935#endif
936 struct net_device *dev;
937 int update_isrouter = 0;
938
939 write_lock_bh(&neigh->lock);
940
941 dev = neigh->dev;
942 old = neigh->nud_state;
943 err = -EPERM;
944
945 if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
946 (old & (NUD_NOARP | NUD_PERMANENT)))
947 goto out;
948
949 if (!(new & NUD_VALID)) {
950 neigh_del_timer(neigh);
951 if (old & NUD_CONNECTED)
952 neigh_suspect(neigh);
953 neigh->nud_state = new;
954 err = 0;
955#ifdef CONFIG_ARPD
956 notify = old & NUD_VALID;
957#endif
958 goto out;
959 }
960
961 /* Compare new lladdr with cached one */
962 if (!dev->addr_len) {
963 /* First case: device needs no address. */
964 lladdr = neigh->ha;
965 } else if (lladdr) {
966 /* The second case: if something is already cached
967 and a new address is proposed:
968 - compare new & old
969 - if they are different, check override flag
970 */
971 if ((old & NUD_VALID) &&
972 !memcmp(lladdr, neigh->ha, dev->addr_len))
973 lladdr = neigh->ha;
974 } else {
975 /* No address is supplied; if we know something,
976 use it, otherwise discard the request.
977 */
978 err = -EINVAL;
979 if (!(old & NUD_VALID))
980 goto out;
981 lladdr = neigh->ha;
982 }
983
984 if (new & NUD_CONNECTED)
985 neigh->confirmed = jiffies;
986 neigh->updated = jiffies;
987
988 /* If entry was valid and address is not changed,
989 do not change entry state, if new one is STALE.
990 */
991 err = 0;
992 update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
993 if (old & NUD_VALID) {
994 if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
995 update_isrouter = 0;
996 if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
997 (old & NUD_CONNECTED)) {
998 lladdr = neigh->ha;
999 new = NUD_STALE;
1000 } else
1001 goto out;
1002 } else {
1003 if (lladdr == neigh->ha && new == NUD_STALE &&
1004 ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
1005 (old & NUD_CONNECTED))
1006 )
1007 new = old;
1008 }
1009 }
1010
1011 if (new != old) {
1012 neigh_del_timer(neigh);
1013 if (new & NUD_IN_TIMER) {
1014 neigh_hold(neigh);
1015 neigh->timer.expires = jiffies +
1016 ((new & NUD_REACHABLE) ?
1017 neigh->parms->reachable_time : 0);
1018 add_timer(&neigh->timer);
1019 }
1020 neigh->nud_state = new;
1021 }
1022
1023 if (lladdr != neigh->ha) {
1024 memcpy(&neigh->ha, lladdr, dev->addr_len);
1025 neigh_update_hhs(neigh);
1026 if (!(new & NUD_CONNECTED))
1027 neigh->confirmed = jiffies -
1028 (neigh->parms->base_reachable_time << 1);
1029#ifdef CONFIG_ARPD
1030 notify = 1;
1031#endif
1032 }
1033 if (new == old)
1034 goto out;
1035 if (new & NUD_CONNECTED)
1036 neigh_connect(neigh);
1037 else
1038 neigh_suspect(neigh);
1039 if (!(old & NUD_VALID)) {
1040 struct sk_buff *skb;
1041
1042 /* Again: avoid dead loop if something went wrong */
1043
1044 while (neigh->nud_state & NUD_VALID &&
1045 (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
1046 struct neighbour *n1 = neigh;
1047 write_unlock_bh(&neigh->lock);
1048 /* On shaper/eql skb->dst->neighbour != neigh :( */
1049 if (skb->dst && skb->dst->neighbour)
1050 n1 = skb->dst->neighbour;
1051 n1->output(skb);
1052 write_lock_bh(&neigh->lock);
1053 }
1054 skb_queue_purge(&neigh->arp_queue);
1055 }
1056out:
1057 if (update_isrouter) {
1058 neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
1059 (neigh->flags | NTF_ROUTER) :
1060 (neigh->flags & ~NTF_ROUTER);
1061 }
1062 write_unlock_bh(&neigh->lock);
1063#ifdef CONFIG_ARPD
1064 if (notify && neigh->parms->app_probes)
1065 neigh_app_notify(neigh);
1066#endif
1067 return err;
1068}
1069
1070struct neighbour *neigh_event_ns(struct neigh_table *tbl,
1071 u8 *lladdr, void *saddr,
1072 struct net_device *dev)
1073{
1074 struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev,
1075 lladdr || !dev->addr_len);
1076 if (neigh)
1077 neigh_update(neigh, lladdr, NUD_STALE,
1078 NEIGH_UPDATE_F_OVERRIDE);
1079 return neigh;
1080}
1081
1082static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
1083 u16 protocol)
1084{
1085 struct hh_cache *hh;
1086 struct net_device *dev = dst->dev;
1087
1088 for (hh = n->hh; hh; hh = hh->hh_next)
1089 if (hh->hh_type == protocol)
1090 break;
1091
1092 if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
1093 memset(hh, 0, sizeof(struct hh_cache));
1094 rwlock_init(&hh->hh_lock);
1095 hh->hh_type = protocol;
1096 atomic_set(&hh->hh_refcnt, 0);
1097 hh->hh_next = NULL;
1098 if (dev->hard_header_cache(n, hh)) {
1099 kfree(hh);
1100 hh = NULL;
1101 } else {
1102 atomic_inc(&hh->hh_refcnt);
1103 hh->hh_next = n->hh;
1104 n->hh = hh;
1105 if (n->nud_state & NUD_CONNECTED)
1106 hh->hh_output = n->ops->hh_output;
1107 else
1108 hh->hh_output = n->ops->output;
1109 }
1110 }
1111 if (hh) {
1112 atomic_inc(&hh->hh_refcnt);
1113 dst->hh = hh;
1114 }
1115}
1116
1117/* This function can be used in contexts where only the old dev_queue_xmit
1118   worked, e.g. if you want to override the normal output path (eql, shaper),
1119   but resolution has not been made yet.
1120 */
1121
1122int neigh_compat_output(struct sk_buff *skb)
1123{
1124 struct net_device *dev = skb->dev;
1125
1126 __skb_pull(skb, skb->nh.raw - skb->data);
1127
1128 if (dev->hard_header &&
1129 dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
1130 skb->len) < 0 &&
1131 dev->rebuild_header(skb))
1132 return 0;
1133
1134 return dev_queue_xmit(skb);
1135}
1136
1137/* Slow and careful. */
1138
1139int neigh_resolve_output(struct sk_buff *skb)
1140{
1141 struct dst_entry *dst = skb->dst;
1142 struct neighbour *neigh;
1143 int rc = 0;
1144
1145 if (!dst || !(neigh = dst->neighbour))
1146 goto discard;
1147
1148 __skb_pull(skb, skb->nh.raw - skb->data);
1149
1150 if (!neigh_event_send(neigh, skb)) {
1151 int err;
1152 struct net_device *dev = neigh->dev;
1153 if (dev->hard_header_cache && !dst->hh) {
1154 write_lock_bh(&neigh->lock);
1155 if (!dst->hh)
1156 neigh_hh_init(neigh, dst, dst->ops->protocol);
1157 err = dev->hard_header(skb, dev, ntohs(skb->protocol),
1158 neigh->ha, NULL, skb->len);
1159 write_unlock_bh(&neigh->lock);
1160 } else {
1161 read_lock_bh(&neigh->lock);
1162 err = dev->hard_header(skb, dev, ntohs(skb->protocol),
1163 neigh->ha, NULL, skb->len);
1164 read_unlock_bh(&neigh->lock);
1165 }
1166 if (err >= 0)
1167 rc = neigh->ops->queue_xmit(skb);
1168 else
1169 goto out_kfree_skb;
1170 }
1171out:
1172 return rc;
1173discard:
1174 NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
1175 dst, dst ? dst->neighbour : NULL);
1176out_kfree_skb:
1177 rc = -EINVAL;
1178 kfree_skb(skb);
1179 goto out;
1180}
1181
1182/* As fast as possible without hh cache */
1183
1184int neigh_connected_output(struct sk_buff *skb)
1185{
1186 int err;
1187 struct dst_entry *dst = skb->dst;
1188 struct neighbour *neigh = dst->neighbour;
1189 struct net_device *dev = neigh->dev;
1190
1191 __skb_pull(skb, skb->nh.raw - skb->data);
1192
1193 read_lock_bh(&neigh->lock);
1194 err = dev->hard_header(skb, dev, ntohs(skb->protocol),
1195 neigh->ha, NULL, skb->len);
1196 read_unlock_bh(&neigh->lock);
1197 if (err >= 0)
1198 err = neigh->ops->queue_xmit(skb);
1199 else {
1200 err = -EINVAL;
1201 kfree_skb(skb);
1202 }
1203 return err;
1204}
1205
1206static void neigh_proxy_process(unsigned long arg)
1207{
1208 struct neigh_table *tbl = (struct neigh_table *)arg;
1209 long sched_next = 0;
1210 unsigned long now = jiffies;
1211 struct sk_buff *skb;
1212
1213 spin_lock(&tbl->proxy_queue.lock);
1214
1215 skb = tbl->proxy_queue.next;
1216
1217 while (skb != (struct sk_buff *)&tbl->proxy_queue) {
1218 struct sk_buff *back = skb;
1219 long tdif = back->stamp.tv_usec - now;
1220
1221 skb = skb->next;
1222 if (tdif <= 0) {
1223 struct net_device *dev = back->dev;
1224 __skb_unlink(back, &tbl->proxy_queue);
1225 if (tbl->proxy_redo && netif_running(dev))
1226 tbl->proxy_redo(back);
1227 else
1228 kfree_skb(back);
1229
1230 dev_put(dev);
1231 } else if (!sched_next || tdif < sched_next)
1232 sched_next = tdif;
1233 }
1234 del_timer(&tbl->proxy_timer);
1235 if (sched_next)
1236 mod_timer(&tbl->proxy_timer, jiffies + sched_next);
1237 spin_unlock(&tbl->proxy_queue.lock);
1238}
1239
1240void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
1241 struct sk_buff *skb)
1242{
1243 unsigned long now = jiffies;
1244 unsigned long sched_next = now + (net_random() % p->proxy_delay);
1245
1246 if (tbl->proxy_queue.qlen > p->proxy_qlen) {
1247 kfree_skb(skb);
1248 return;
1249 }
1250 skb->stamp.tv_sec = LOCALLY_ENQUEUED;
1251 skb->stamp.tv_usec = sched_next;
1252
1253 spin_lock(&tbl->proxy_queue.lock);
1254 if (del_timer(&tbl->proxy_timer)) {
1255 if (time_before(tbl->proxy_timer.expires, sched_next))
1256 sched_next = tbl->proxy_timer.expires;
1257 }
1258 dst_release(skb->dst);
1259 skb->dst = NULL;
1260 dev_hold(skb->dev);
1261 __skb_queue_tail(&tbl->proxy_queue, skb);
1262 mod_timer(&tbl->proxy_timer, sched_next);
1263 spin_unlock(&tbl->proxy_queue.lock);
1264}
1265
1266
1267struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
1268 struct neigh_table *tbl)
1269{
1270 struct neigh_parms *p = kmalloc(sizeof(*p), GFP_KERNEL);
1271
1272 if (p) {
1273 memcpy(p, &tbl->parms, sizeof(*p));
1274 p->tbl = tbl;
1275 atomic_set(&p->refcnt, 1);
1276 INIT_RCU_HEAD(&p->rcu_head);
1277 p->reachable_time =
1278 neigh_rand_reach_time(p->base_reachable_time);
1279 if (dev && dev->neigh_setup && dev->neigh_setup(dev, p)) {
1280 kfree(p);
1281 return NULL;
1282 }
1283 p->sysctl_table = NULL;
1284 write_lock_bh(&tbl->lock);
1285 p->next = tbl->parms.next;
1286 tbl->parms.next = p;
1287 write_unlock_bh(&tbl->lock);
1288 }
1289 return p;
1290}
1291
1292static void neigh_rcu_free_parms(struct rcu_head *head)
1293{
1294 struct neigh_parms *parms =
1295 container_of(head, struct neigh_parms, rcu_head);
1296
1297 neigh_parms_put(parms);
1298}
1299
1300void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
1301{
1302 struct neigh_parms **p;
1303
1304 if (!parms || parms == &tbl->parms)
1305 return;
1306 write_lock_bh(&tbl->lock);
1307 for (p = &tbl->parms.next; *p; p = &(*p)->next) {
1308 if (*p == parms) {
1309 *p = parms->next;
1310 parms->dead = 1;
1311 write_unlock_bh(&tbl->lock);
1312 call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
1313 return;
1314 }
1315 }
1316 write_unlock_bh(&tbl->lock);
1317 NEIGH_PRINTK1("neigh_parms_release: not found\n");
1318}
1319
1320void neigh_parms_destroy(struct neigh_parms *parms)
1321{
1322 kfree(parms);
1323}
1324
1325
1326void neigh_table_init(struct neigh_table *tbl)
1327{
1328 unsigned long now = jiffies;
1329 unsigned long phsize;
1330
1331 atomic_set(&tbl->parms.refcnt, 1);
1332 INIT_RCU_HEAD(&tbl->parms.rcu_head);
1333 tbl->parms.reachable_time =
1334 neigh_rand_reach_time(tbl->parms.base_reachable_time);
1335
1336 if (!tbl->kmem_cachep)
1337 tbl->kmem_cachep = kmem_cache_create(tbl->id,
1338 tbl->entry_size,
1339 0, SLAB_HWCACHE_ALIGN,
1340 NULL, NULL);
1341
1342 if (!tbl->kmem_cachep)
1343 panic("cannot create neighbour cache");
1344
1345 tbl->stats = alloc_percpu(struct neigh_statistics);
1346 if (!tbl->stats)
1347 panic("cannot create neighbour cache statistics");
1348
1349#ifdef CONFIG_PROC_FS
1350 tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat);
1351 if (!tbl->pde)
1352 panic("cannot create neighbour proc dir entry");
1353 tbl->pde->proc_fops = &neigh_stat_seq_fops;
1354 tbl->pde->data = tbl;
1355#endif
1356
1357 tbl->hash_mask = 1;
1358 tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
1359
1360 phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
1361 tbl->phash_buckets = kmalloc(phsize, GFP_KERNEL);
1362
1363 if (!tbl->hash_buckets || !tbl->phash_buckets)
1364 panic("cannot allocate neighbour cache hashes");
1365
1366 memset(tbl->phash_buckets, 0, phsize);
1367
1368 get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
1369
1370 rwlock_init(&tbl->lock);
1371 init_timer(&tbl->gc_timer);
1372 tbl->gc_timer.data = (unsigned long)tbl;
1373 tbl->gc_timer.function = neigh_periodic_timer;
1374 tbl->gc_timer.expires = now + 1;
1375 add_timer(&tbl->gc_timer);
1376
1377 init_timer(&tbl->proxy_timer);
1378 tbl->proxy_timer.data = (unsigned long)tbl;
1379 tbl->proxy_timer.function = neigh_proxy_process;
1380 skb_queue_head_init(&tbl->proxy_queue);
1381
1382 tbl->last_flush = now;
1383 tbl->last_rand = now + tbl->parms.reachable_time * 20;
1384 write_lock(&neigh_tbl_lock);
1385 tbl->next = neigh_tables;
1386 neigh_tables = tbl;
1387 write_unlock(&neigh_tbl_lock);
1388}
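/*
 * Hedged usage sketch (not part of the original source): a protocol declares
 * its table and hands it to neigh_table_init() at init time, roughly
 *
 *	static struct neigh_table my_tbl = {
 *		.family     = AF_INET,
 *		.entry_size = sizeof(struct neighbour) + 4,
 *		.key_len    = 4,
 *		.id         = "my_cache",
 *		...
 *	};
 *
 *	neigh_table_init(&my_tbl);
 *
 * Only fields referenced in this file are shown; the values are assumptions
 * for illustration.
 */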
1389
1390int neigh_table_clear(struct neigh_table *tbl)
1391{
1392 struct neigh_table **tp;
1393
1394	/* This is not clean... fix it so that the IPv6 module can be unloaded safely. */
1395 del_timer_sync(&tbl->gc_timer);
1396 del_timer_sync(&tbl->proxy_timer);
1397 pneigh_queue_purge(&tbl->proxy_queue);
1398 neigh_ifdown(tbl, NULL);
1399 if (atomic_read(&tbl->entries))
1400 printk(KERN_CRIT "neighbour leakage\n");
1401 write_lock(&neigh_tbl_lock);
1402 for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
1403 if (*tp == tbl) {
1404 *tp = tbl->next;
1405 break;
1406 }
1407 }
1408 write_unlock(&neigh_tbl_lock);
1409
1410 neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
1411 tbl->hash_buckets = NULL;
1412
1413 kfree(tbl->phash_buckets);
1414 tbl->phash_buckets = NULL;
1415
1416 return 0;
1417}
1418
1419int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1420{
1421 struct ndmsg *ndm = NLMSG_DATA(nlh);
1422 struct rtattr **nda = arg;
1423 struct neigh_table *tbl;
1424 struct net_device *dev = NULL;
1425 int err = -ENODEV;
1426
1427 if (ndm->ndm_ifindex &&
1428 (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL)
1429 goto out;
1430
1431 read_lock(&neigh_tbl_lock);
1432 for (tbl = neigh_tables; tbl; tbl = tbl->next) {
1433 struct rtattr *dst_attr = nda[NDA_DST - 1];
1434 struct neighbour *n;
1435
1436 if (tbl->family != ndm->ndm_family)
1437 continue;
1438 read_unlock(&neigh_tbl_lock);
1439
1440 err = -EINVAL;
1441 if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len)
1442 goto out_dev_put;
1443
1444 if (ndm->ndm_flags & NTF_PROXY) {
1445 err = pneigh_delete(tbl, RTA_DATA(dst_attr), dev);
1446 goto out_dev_put;
1447 }
1448
1449 if (!dev)
1450 goto out;
1451
1452 n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev);
1453 if (n) {
1454 err = neigh_update(n, NULL, NUD_FAILED,
1455 NEIGH_UPDATE_F_OVERRIDE|
1456 NEIGH_UPDATE_F_ADMIN);
1457 neigh_release(n);
1458 }
1459 goto out_dev_put;
1460 }
1461 read_unlock(&neigh_tbl_lock);
1462 err = -EADDRNOTAVAIL;
1463out_dev_put:
1464 if (dev)
1465 dev_put(dev);
1466out:
1467 return err;
1468}
1469
1470int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1471{
1472 struct ndmsg *ndm = NLMSG_DATA(nlh);
1473 struct rtattr **nda = arg;
1474 struct neigh_table *tbl;
1475 struct net_device *dev = NULL;
1476 int err = -ENODEV;
1477
1478 if (ndm->ndm_ifindex &&
1479 (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL)
1480 goto out;
1481
1482 read_lock(&neigh_tbl_lock);
1483 for (tbl = neigh_tables; tbl; tbl = tbl->next) {
1484 struct rtattr *lladdr_attr = nda[NDA_LLADDR - 1];
1485 struct rtattr *dst_attr = nda[NDA_DST - 1];
1486 int override = 1;
1487 struct neighbour *n;
1488
1489 if (tbl->family != ndm->ndm_family)
1490 continue;
1491 read_unlock(&neigh_tbl_lock);
1492
1493 err = -EINVAL;
1494 if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len)
1495 goto out_dev_put;
1496
1497 if (ndm->ndm_flags & NTF_PROXY) {
1498 err = -ENOBUFS;
1499 if (pneigh_lookup(tbl, RTA_DATA(dst_attr), dev, 1))
1500 err = 0;
1501 goto out_dev_put;
1502 }
1503
1504 err = -EINVAL;
1505 if (!dev)
1506 goto out;
1507 if (lladdr_attr && RTA_PAYLOAD(lladdr_attr) < dev->addr_len)
1508 goto out_dev_put;
1509
1510 n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev);
1511 if (n) {
1512 if (nlh->nlmsg_flags & NLM_F_EXCL) {
1513 err = -EEXIST;
1514 neigh_release(n);
1515 goto out_dev_put;
1516 }
1517
1518 override = nlh->nlmsg_flags & NLM_F_REPLACE;
1519 } else if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
1520 err = -ENOENT;
1521 goto out_dev_put;
1522 } else {
1523 n = __neigh_lookup_errno(tbl, RTA_DATA(dst_attr), dev);
1524 if (IS_ERR(n)) {
1525 err = PTR_ERR(n);
1526 goto out_dev_put;
1527 }
1528 }
1529
1530 err = neigh_update(n,
1531 lladdr_attr ? RTA_DATA(lladdr_attr) : NULL,
1532 ndm->ndm_state,
1533 (override ? NEIGH_UPDATE_F_OVERRIDE : 0) |
1534 NEIGH_UPDATE_F_ADMIN);
1535
1536 neigh_release(n);
1537 goto out_dev_put;
1538 }
1539
1540 read_unlock(&neigh_tbl_lock);
1541 err = -EADDRNOTAVAIL;
1542out_dev_put:
1543 if (dev)
1544 dev_put(dev);
1545out:
1546 return err;
1547}
1548
1549
1550static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
1551 u32 pid, u32 seq, int event)
1552{
1553 unsigned long now = jiffies;
1554 unsigned char *b = skb->tail;
1555 struct nda_cacheinfo ci;
1556 int locked = 0;
1557 u32 probes;
1558 struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, event,
1559 sizeof(struct ndmsg));
1560 struct ndmsg *ndm = NLMSG_DATA(nlh);
1561
1562 nlh->nlmsg_flags = pid ? NLM_F_MULTI : 0;
1563 ndm->ndm_family = n->ops->family;
1564 ndm->ndm_flags = n->flags;
1565 ndm->ndm_type = n->type;
1566 ndm->ndm_ifindex = n->dev->ifindex;
1567 RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key);
1568 read_lock_bh(&n->lock);
1569 locked = 1;
1570 ndm->ndm_state = n->nud_state;
1571 if (n->nud_state & NUD_VALID)
1572 RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha);
1573 ci.ndm_used = now - n->used;
1574 ci.ndm_confirmed = now - n->confirmed;
1575 ci.ndm_updated = now - n->updated;
1576 ci.ndm_refcnt = atomic_read(&n->refcnt) - 1;
1577 probes = atomic_read(&n->probes);
1578 read_unlock_bh(&n->lock);
1579 locked = 0;
1580 RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
1581 RTA_PUT(skb, NDA_PROBES, sizeof(probes), &probes);
1582 nlh->nlmsg_len = skb->tail - b;
1583 return skb->len;
1584
1585nlmsg_failure:
1586rtattr_failure:
1587 if (locked)
1588 read_unlock_bh(&n->lock);
1589 skb_trim(skb, b - skb->data);
1590 return -1;
1591}
1592
1593
1594static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
1595 struct netlink_callback *cb)
1596{
1597 struct neighbour *n;
1598 int rc, h, s_h = cb->args[1];
1599 int idx, s_idx = idx = cb->args[2];
1600
1601 for (h = 0; h <= tbl->hash_mask; h++) {
1602 if (h < s_h)
1603 continue;
1604 if (h > s_h)
1605 s_idx = 0;
1606 read_lock_bh(&tbl->lock);
1607 for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) {
1608 if (idx < s_idx)
1609 continue;
1610 if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
1611 cb->nlh->nlmsg_seq,
1612 RTM_NEWNEIGH) <= 0) {
1613 read_unlock_bh(&tbl->lock);
1614 rc = -1;
1615 goto out;
1616 }
1617 }
1618 read_unlock_bh(&tbl->lock);
1619 }
1620 rc = skb->len;
1621out:
1622 cb->args[1] = h;
1623 cb->args[2] = idx;
1624 return rc;
1625}
1626
1627int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
1628{
1629 struct neigh_table *tbl;
1630 int t, family, s_t;
1631
1632 read_lock(&neigh_tbl_lock);
1633 family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family;
1634 s_t = cb->args[0];
1635
1636 for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) {
1637 if (t < s_t || (family && tbl->family != family))
1638 continue;
1639 if (t > s_t)
1640 memset(&cb->args[1], 0, sizeof(cb->args) -
1641 sizeof(cb->args[0]));
1642 if (neigh_dump_table(tbl, skb, cb) < 0)
1643 break;
1644 }
1645 read_unlock(&neigh_tbl_lock);
1646
1647 cb->args[0] = t;
1648 return skb->len;
1649}
1650
1651void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
1652{
1653 int chain;
1654
1655 read_lock_bh(&tbl->lock);
1656 for (chain = 0; chain <= tbl->hash_mask; chain++) {
1657 struct neighbour *n;
1658
1659 for (n = tbl->hash_buckets[chain]; n; n = n->next)
1660 cb(n, cookie);
1661 }
1662 read_unlock_bh(&tbl->lock);
1663}
1664EXPORT_SYMBOL(neigh_for_each);
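/*
 * Hedged usage sketch (not part of the original source): counting reachable
 * entries with the iterator above.  The table pointer (arp_tbl) is an
 * assumption for illustration.
 *
 *	static void count_reachable(struct neighbour *n, void *cookie)
 *	{
 *		int *count = cookie;
 *
 *		if (n->nud_state & NUD_REACHABLE)
 *			(*count)++;
 *	}
 *
 *	int reachable = 0;
 *	neigh_for_each(&arp_tbl, count_reachable, &reachable);
 */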
1665
1666/* The tbl->lock must be held as a writer and BH disabled. */
1667void __neigh_for_each_release(struct neigh_table *tbl,
1668 int (*cb)(struct neighbour *))
1669{
1670 int chain;
1671
1672 for (chain = 0; chain <= tbl->hash_mask; chain++) {
1673 struct neighbour *n, **np;
1674
1675 np = &tbl->hash_buckets[chain];
1676 while ((n = *np) != NULL) {
1677 int release;
1678
1679 write_lock(&n->lock);
1680 release = cb(n);
1681 if (release) {
1682 *np = n->next;
1683 n->dead = 1;
1684 } else
1685 np = &n->next;
1686 write_unlock(&n->lock);
1687 if (release)
1688 neigh_release(n);
1689 }
1690 }
1691}
1692EXPORT_SYMBOL(__neigh_for_each_release);
1693
1694#ifdef CONFIG_PROC_FS
1695
1696static struct neighbour *neigh_get_first(struct seq_file *seq)
1697{
1698 struct neigh_seq_state *state = seq->private;
1699 struct neigh_table *tbl = state->tbl;
1700 struct neighbour *n = NULL;
1701 int bucket = state->bucket;
1702
1703 state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
1704 for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
1705 n = tbl->hash_buckets[bucket];
1706
1707 while (n) {
1708 if (state->neigh_sub_iter) {
1709 loff_t fakep = 0;
1710 void *v;
1711
1712 v = state->neigh_sub_iter(state, n, &fakep);
1713 if (!v)
1714 goto next;
1715 }
1716 if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
1717 break;
1718 if (n->nud_state & ~NUD_NOARP)
1719 break;
1720 next:
1721 n = n->next;
1722 }
1723
1724 if (n)
1725 break;
1726 }
1727 state->bucket = bucket;
1728
1729 return n;
1730}
1731
1732static struct neighbour *neigh_get_next(struct seq_file *seq,
1733 struct neighbour *n,
1734 loff_t *pos)
1735{
1736 struct neigh_seq_state *state = seq->private;
1737 struct neigh_table *tbl = state->tbl;
1738
1739 if (state->neigh_sub_iter) {
1740 void *v = state->neigh_sub_iter(state, n, pos);
1741 if (v)
1742 return n;
1743 }
1744 n = n->next;
1745
1746 while (1) {
1747 while (n) {
1748 if (state->neigh_sub_iter) {
1749 void *v = state->neigh_sub_iter(state, n, pos);
1750 if (v)
1751 return n;
1752 goto next;
1753 }
1754 if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
1755 break;
1756
1757 if (n->nud_state & ~NUD_NOARP)
1758 break;
1759 next:
1760 n = n->next;
1761 }
1762
1763 if (n)
1764 break;
1765
1766 if (++state->bucket > tbl->hash_mask)
1767 break;
1768
1769 n = tbl->hash_buckets[state->bucket];
1770 }
1771
1772 if (n && pos)
1773 --(*pos);
1774 return n;
1775}
1776
1777static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
1778{
1779 struct neighbour *n = neigh_get_first(seq);
1780
1781 if (n) {
1782 while (*pos) {
1783 n = neigh_get_next(seq, n, pos);
1784 if (!n)
1785 break;
1786 }
1787 }
1788 return *pos ? NULL : n;
1789}
1790
1791static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
1792{
1793 struct neigh_seq_state *state = seq->private;
1794 struct neigh_table *tbl = state->tbl;
1795 struct pneigh_entry *pn = NULL;
1796 int bucket = state->bucket;
1797
1798 state->flags |= NEIGH_SEQ_IS_PNEIGH;
1799 for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
1800 pn = tbl->phash_buckets[bucket];
1801 if (pn)
1802 break;
1803 }
1804 state->bucket = bucket;
1805
1806 return pn;
1807}
1808
1809static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
1810 struct pneigh_entry *pn,
1811 loff_t *pos)
1812{
1813 struct neigh_seq_state *state = seq->private;
1814 struct neigh_table *tbl = state->tbl;
1815
1816 pn = pn->next;
1817 while (!pn) {
1818 if (++state->bucket > PNEIGH_HASHMASK)
1819 break;
1820 pn = tbl->phash_buckets[state->bucket];
1821 if (pn)
1822 break;
1823 }
1824
1825 if (pn && pos)
1826 --(*pos);
1827
1828 return pn;
1829}
1830
1831static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
1832{
1833 struct pneigh_entry *pn = pneigh_get_first(seq);
1834
1835 if (pn) {
1836 while (*pos) {
1837 pn = pneigh_get_next(seq, pn, pos);
1838 if (!pn)
1839 break;
1840 }
1841 }
1842 return *pos ? NULL : pn;
1843}
1844
1845static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
1846{
1847 struct neigh_seq_state *state = seq->private;
1848 void *rc;
1849
1850 rc = neigh_get_idx(seq, pos);
1851 if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY))
1852 rc = pneigh_get_idx(seq, pos);
1853
1854 return rc;
1855}
1856
1857void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
1858{
1859 struct neigh_seq_state *state = seq->private;
1860 loff_t pos_minus_one;
1861
1862 state->tbl = tbl;
1863 state->bucket = 0;
1864 state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
1865
1866 read_lock_bh(&tbl->lock);
1867
1868 pos_minus_one = *pos - 1;
1869 return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN;
1870}
1871EXPORT_SYMBOL(neigh_seq_start);
1872
1873void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1874{
1875 struct neigh_seq_state *state;
1876 void *rc;
1877
1878 if (v == SEQ_START_TOKEN) {
1879 rc = neigh_get_idx(seq, pos);
1880 goto out;
1881 }
1882
1883 state = seq->private;
1884 if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) {
1885 rc = neigh_get_next(seq, v, NULL);
1886 if (rc)
1887 goto out;
1888 if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY))
1889 rc = pneigh_get_first(seq);
1890 } else {
1891 BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY);
1892 rc = pneigh_get_next(seq, v, NULL);
1893 }
1894out:
1895 ++(*pos);
1896 return rc;
1897}
1898EXPORT_SYMBOL(neigh_seq_next);
1899
1900void neigh_seq_stop(struct seq_file *seq, void *v)
1901{
1902 struct neigh_seq_state *state = seq->private;
1903 struct neigh_table *tbl = state->tbl;
1904
1905 read_unlock_bh(&tbl->lock);
1906}
1907EXPORT_SYMBOL(neigh_seq_stop);
1908
1909/* statistics via seq_file */
1910
1911static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
1912{
1913 struct proc_dir_entry *pde = seq->private;
1914 struct neigh_table *tbl = pde->data;
1915 int cpu;
1916
1917 if (*pos == 0)
1918 return SEQ_START_TOKEN;
1919
1920 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
1921 if (!cpu_possible(cpu))
1922 continue;
1923 *pos = cpu+1;
1924 return per_cpu_ptr(tbl->stats, cpu);
1925 }
1926 return NULL;
1927}
1928
1929static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1930{
1931 struct proc_dir_entry *pde = seq->private;
1932 struct neigh_table *tbl = pde->data;
1933 int cpu;
1934
1935 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
1936 if (!cpu_possible(cpu))
1937 continue;
1938 *pos = cpu+1;
1939 return per_cpu_ptr(tbl->stats, cpu);
1940 }
1941 return NULL;
1942}
1943
1944static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
1945{
1946
1947}
1948
1949static int neigh_stat_seq_show(struct seq_file *seq, void *v)
1950{
1951 struct proc_dir_entry *pde = seq->private;
1952 struct neigh_table *tbl = pde->data;
1953 struct neigh_statistics *st = v;
1954
1955 if (v == SEQ_START_TOKEN) {
1956 seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs forced_gc_goal_miss\n");
1957 return 0;
1958 }
1959
1960 seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
1961 "%08lx %08lx %08lx %08lx\n",
1962 atomic_read(&tbl->entries),
1963
1964 st->allocs,
1965 st->destroys,
1966 st->hash_grows,
1967
1968 st->lookups,
1969 st->hits,
1970
1971 st->res_failed,
1972
1973 st->rcv_probes_mcast,
1974 st->rcv_probes_ucast,
1975
1976 st->periodic_gc_runs,
1977 st->forced_gc_runs
1978 );
1979
1980 return 0;
1981}
1982
1983static struct seq_operations neigh_stat_seq_ops = {
1984 .start = neigh_stat_seq_start,
1985 .next = neigh_stat_seq_next,
1986 .stop = neigh_stat_seq_stop,
1987 .show = neigh_stat_seq_show,
1988};
1989
1990static int neigh_stat_seq_open(struct inode *inode, struct file *file)
1991{
1992 int ret = seq_open(file, &neigh_stat_seq_ops);
1993
1994 if (!ret) {
1995 struct seq_file *sf = file->private_data;
1996 sf->private = PDE(inode);
1997 }
1998 return ret;
1999};
2000
2001static struct file_operations neigh_stat_seq_fops = {
2002 .owner = THIS_MODULE,
2003 .open = neigh_stat_seq_open,
2004 .read = seq_read,
2005 .llseek = seq_lseek,
2006 .release = seq_release,
2007};
2008
2009#endif /* CONFIG_PROC_FS */
2010
2011#ifdef CONFIG_ARPD
2012void neigh_app_ns(struct neighbour *n)
2013{
2014 struct nlmsghdr *nlh;
2015 int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256);
2016 struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC);
2017
2018 if (!skb)
2019 return;
2020
2021 if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) {
2022 kfree_skb(skb);
2023 return;
2024 }
2025 nlh = (struct nlmsghdr *)skb->data;
2026 nlh->nlmsg_flags = NLM_F_REQUEST;
2027 NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
2028 netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
2029}
2030
2031static void neigh_app_notify(struct neighbour *n)
2032{
2033 struct nlmsghdr *nlh;
2034 int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256);
2035 struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC);
2036
2037 if (!skb)
2038 return;
2039
2040 if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) {
2041 kfree_skb(skb);
2042 return;
2043 }
2044 nlh = (struct nlmsghdr *)skb->data;
2045 NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
2046 netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
2047}
2048
2049#endif /* CONFIG_ARPD */
2050
2051#ifdef CONFIG_SYSCTL
2052
2053static struct neigh_sysctl_table {
2054 struct ctl_table_header *sysctl_header;
2055 ctl_table neigh_vars[__NET_NEIGH_MAX];
2056 ctl_table neigh_dev[2];
2057 ctl_table neigh_neigh_dir[2];
2058 ctl_table neigh_proto_dir[2];
2059 ctl_table neigh_root_dir[2];
2060} neigh_sysctl_template = {
2061 .neigh_vars = {
2062 {
2063 .ctl_name = NET_NEIGH_MCAST_SOLICIT,
2064 .procname = "mcast_solicit",
2065 .maxlen = sizeof(int),
2066 .mode = 0644,
2067 .proc_handler = &proc_dointvec,
2068 },
2069 {
2070 .ctl_name = NET_NEIGH_UCAST_SOLICIT,
2071 .procname = "ucast_solicit",
2072 .maxlen = sizeof(int),
2073 .mode = 0644,
2074 .proc_handler = &proc_dointvec,
2075 },
2076 {
2077 .ctl_name = NET_NEIGH_APP_SOLICIT,
2078 .procname = "app_solicit",
2079 .maxlen = sizeof(int),
2080 .mode = 0644,
2081 .proc_handler = &proc_dointvec,
2082 },
2083 {
2084 .ctl_name = NET_NEIGH_RETRANS_TIME,
2085 .procname = "retrans_time",
2086 .maxlen = sizeof(int),
2087 .mode = 0644,
2088 .proc_handler = &proc_dointvec_userhz_jiffies,
2089 },
2090 {
2091 .ctl_name = NET_NEIGH_REACHABLE_TIME,
2092 .procname = "base_reachable_time",
2093 .maxlen = sizeof(int),
2094 .mode = 0644,
2095 .proc_handler = &proc_dointvec_jiffies,
2096 .strategy = &sysctl_jiffies,
2097 },
2098 {
2099 .ctl_name = NET_NEIGH_DELAY_PROBE_TIME,
2100 .procname = "delay_first_probe_time",
2101 .maxlen = sizeof(int),
2102 .mode = 0644,
2103 .proc_handler = &proc_dointvec_jiffies,
2104 .strategy = &sysctl_jiffies,
2105 },
2106 {
2107 .ctl_name = NET_NEIGH_GC_STALE_TIME,
2108 .procname = "gc_stale_time",
2109 .maxlen = sizeof(int),
2110 .mode = 0644,
2111 .proc_handler = &proc_dointvec_jiffies,
2112 .strategy = &sysctl_jiffies,
2113 },
2114 {
2115 .ctl_name = NET_NEIGH_UNRES_QLEN,
2116 .procname = "unres_qlen",
2117 .maxlen = sizeof(int),
2118 .mode = 0644,
2119 .proc_handler = &proc_dointvec,
2120 },
2121 {
2122 .ctl_name = NET_NEIGH_PROXY_QLEN,
2123 .procname = "proxy_qlen",
2124 .maxlen = sizeof(int),
2125 .mode = 0644,
2126 .proc_handler = &proc_dointvec,
2127 },
2128 {
2129 .ctl_name = NET_NEIGH_ANYCAST_DELAY,
2130 .procname = "anycast_delay",
2131 .maxlen = sizeof(int),
2132 .mode = 0644,
2133 .proc_handler = &proc_dointvec_userhz_jiffies,
2134 },
2135 {
2136 .ctl_name = NET_NEIGH_PROXY_DELAY,
2137 .procname = "proxy_delay",
2138 .maxlen = sizeof(int),
2139 .mode = 0644,
2140 .proc_handler = &proc_dointvec_userhz_jiffies,
2141 },
2142 {
2143 .ctl_name = NET_NEIGH_LOCKTIME,
2144 .procname = "locktime",
2145 .maxlen = sizeof(int),
2146 .mode = 0644,
2147 .proc_handler = &proc_dointvec_userhz_jiffies,
2148 },
2149 {
2150 .ctl_name = NET_NEIGH_GC_INTERVAL,
2151 .procname = "gc_interval",
2152 .maxlen = sizeof(int),
2153 .mode = 0644,
2154 .proc_handler = &proc_dointvec_jiffies,
2155 .strategy = &sysctl_jiffies,
2156 },
2157 {
2158 .ctl_name = NET_NEIGH_GC_THRESH1,
2159 .procname = "gc_thresh1",
2160 .maxlen = sizeof(int),
2161 .mode = 0644,
2162 .proc_handler = &proc_dointvec,
2163 },
2164 {
2165 .ctl_name = NET_NEIGH_GC_THRESH2,
2166 .procname = "gc_thresh2",
2167 .maxlen = sizeof(int),
2168 .mode = 0644,
2169 .proc_handler = &proc_dointvec,
2170 },
2171 {
2172 .ctl_name = NET_NEIGH_GC_THRESH3,
2173 .procname = "gc_thresh3",
2174 .maxlen = sizeof(int),
2175 .mode = 0644,
2176 .proc_handler = &proc_dointvec,
2177 },
2178 {
2179 .ctl_name = NET_NEIGH_RETRANS_TIME_MS,
2180 .procname = "retrans_time_ms",
2181 .maxlen = sizeof(int),
2182 .mode = 0644,
2183 .proc_handler = &proc_dointvec_ms_jiffies,
2184 .strategy = &sysctl_ms_jiffies,
2185 },
2186 {
2187 .ctl_name = NET_NEIGH_REACHABLE_TIME_MS,
2188 .procname = "base_reachable_time_ms",
2189 .maxlen = sizeof(int),
2190 .mode = 0644,
2191 .proc_handler = &proc_dointvec_ms_jiffies,
2192 .strategy = &sysctl_ms_jiffies,
2193 },
2194 },
2195 .neigh_dev = {
2196 {
2197 .ctl_name = NET_PROTO_CONF_DEFAULT,
2198 .procname = "default",
2199 .mode = 0555,
2200 },
2201 },
2202 .neigh_neigh_dir = {
2203 {
2204 .procname = "neigh",
2205 .mode = 0555,
2206 },
2207 },
2208 .neigh_proto_dir = {
2209 {
2210 .mode = 0555,
2211 },
2212 },
2213 .neigh_root_dir = {
2214 {
2215 .ctl_name = CTL_NET,
2216 .procname = "net",
2217 .mode = 0555,
2218 },
2219 },
2220};
2221
2222int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
2223 int p_id, int pdev_id, char *p_name,
2224 proc_handler *handler, ctl_handler *strategy)
2225{
2226 struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
2227 const char *dev_name_source = NULL;
2228 char *dev_name = NULL;
2229 int err = 0;
2230
2231 if (!t)
2232 return -ENOBUFS;
2233 memcpy(t, &neigh_sysctl_template, sizeof(*t));
2234 t->neigh_vars[0].data = &p->mcast_probes;
2235 t->neigh_vars[1].data = &p->ucast_probes;
2236 t->neigh_vars[2].data = &p->app_probes;
2237 t->neigh_vars[3].data = &p->retrans_time;
2238 t->neigh_vars[4].data = &p->base_reachable_time;
2239 t->neigh_vars[5].data = &p->delay_probe_time;
2240 t->neigh_vars[6].data = &p->gc_staletime;
2241 t->neigh_vars[7].data = &p->queue_len;
2242 t->neigh_vars[8].data = &p->proxy_qlen;
2243 t->neigh_vars[9].data = &p->anycast_delay;
2244 t->neigh_vars[10].data = &p->proxy_delay;
2245 t->neigh_vars[11].data = &p->locktime;
2246
2247 if (dev) {
2248 dev_name_source = dev->name;
2249 t->neigh_dev[0].ctl_name = dev->ifindex;
2250 t->neigh_vars[12].procname = NULL;
2251 t->neigh_vars[13].procname = NULL;
2252 t->neigh_vars[14].procname = NULL;
2253 t->neigh_vars[15].procname = NULL;
2254 } else {
2255 dev_name_source = t->neigh_dev[0].procname;
2256 t->neigh_vars[12].data = (int *)(p + 1);
2257 t->neigh_vars[13].data = (int *)(p + 1) + 1;
2258 t->neigh_vars[14].data = (int *)(p + 1) + 2;
2259 t->neigh_vars[15].data = (int *)(p + 1) + 3;
2260 }
2261
2262 t->neigh_vars[16].data = &p->retrans_time;
2263 t->neigh_vars[17].data = &p->base_reachable_time;
2264
2265 if (handler || strategy) {
2266 /* RetransTime */
2267 t->neigh_vars[3].proc_handler = handler;
2268 t->neigh_vars[3].strategy = strategy;
2269 t->neigh_vars[3].extra1 = dev;
2270 /* ReachableTime */
2271 t->neigh_vars[4].proc_handler = handler;
2272 t->neigh_vars[4].strategy = strategy;
2273 t->neigh_vars[4].extra1 = dev;
2274 /* RetransTime (in milliseconds)*/
2275 t->neigh_vars[16].proc_handler = handler;
2276 t->neigh_vars[16].strategy = strategy;
2277 t->neigh_vars[16].extra1 = dev;
2278 /* ReachableTime (in milliseconds) */
2279 t->neigh_vars[17].proc_handler = handler;
2280 t->neigh_vars[17].strategy = strategy;
2281 t->neigh_vars[17].extra1 = dev;
2282 }
2283
2284 dev_name = net_sysctl_strdup(dev_name_source);
2285 if (!dev_name) {
2286 err = -ENOBUFS;
2287 goto free;
2288 }
2289
2290 t->neigh_dev[0].procname = dev_name;
2291
2292 t->neigh_neigh_dir[0].ctl_name = pdev_id;
2293
2294 t->neigh_proto_dir[0].procname = p_name;
2295 t->neigh_proto_dir[0].ctl_name = p_id;
2296
2297 t->neigh_dev[0].child = t->neigh_vars;
2298 t->neigh_neigh_dir[0].child = t->neigh_dev;
2299 t->neigh_proto_dir[0].child = t->neigh_neigh_dir;
2300 t->neigh_root_dir[0].child = t->neigh_proto_dir;
2301
2302 t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0);
2303 if (!t->sysctl_header) {
2304 err = -ENOBUFS;
2305 goto free_procname;
2306 }
2307 p->sysctl_table = t;
2308 return 0;
2309
2310 /* error path */
2311 free_procname:
2312 kfree(dev_name);
2313 free:
2314 kfree(t);
2315
2316 return err;
2317}
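/*
 * Hedged usage sketch (not part of the original source): a protocol registers
 * its per-device tree roughly as below (the constants shown are IPv4/ARP's;
 * treat the exact arguments as an assumption for illustration):
 *
 *	neigh_sysctl_register(dev, p, NET_IPV4, NET_IPV4_NEIGH,
 *			      "ipv4", NULL, NULL);
 *
 * which ends up under /proc/sys/net/ipv4/neigh/<dev>/.
 */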
2318
2319void neigh_sysctl_unregister(struct neigh_parms *p)
2320{
2321 if (p->sysctl_table) {
2322 struct neigh_sysctl_table *t = p->sysctl_table;
2323 p->sysctl_table = NULL;
2324 unregister_sysctl_table(t->sysctl_header);
2325 kfree(t->neigh_dev[0].procname);
2326 kfree(t);
2327 }
2328}
2329
2330#endif /* CONFIG_SYSCTL */
2331
2332EXPORT_SYMBOL(__neigh_event_send);
2333EXPORT_SYMBOL(neigh_add);
2334EXPORT_SYMBOL(neigh_changeaddr);
2335EXPORT_SYMBOL(neigh_compat_output);
2336EXPORT_SYMBOL(neigh_connected_output);
2337EXPORT_SYMBOL(neigh_create);
2338EXPORT_SYMBOL(neigh_delete);
2339EXPORT_SYMBOL(neigh_destroy);
2340EXPORT_SYMBOL(neigh_dump_info);
2341EXPORT_SYMBOL(neigh_event_ns);
2342EXPORT_SYMBOL(neigh_ifdown);
2343EXPORT_SYMBOL(neigh_lookup);
2344EXPORT_SYMBOL(neigh_lookup_nodev);
2345EXPORT_SYMBOL(neigh_parms_alloc);
2346EXPORT_SYMBOL(neigh_parms_release);
2347EXPORT_SYMBOL(neigh_rand_reach_time);
2348EXPORT_SYMBOL(neigh_resolve_output);
2349EXPORT_SYMBOL(neigh_table_clear);
2350EXPORT_SYMBOL(neigh_table_init);
2351EXPORT_SYMBOL(neigh_update);
2352EXPORT_SYMBOL(neigh_update_hhs);
2353EXPORT_SYMBOL(pneigh_enqueue);
2354EXPORT_SYMBOL(pneigh_lookup);
2355
2356#ifdef CONFIG_ARPD
2357EXPORT_SYMBOL(neigh_app_ns);
2358#endif
2359#ifdef CONFIG_SYSCTL
2360EXPORT_SYMBOL(neigh_sysctl_register);
2361EXPORT_SYMBOL(neigh_sysctl_unregister);
2362#endif
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
new file mode 100644
index 000000000000..060f703659e8
--- /dev/null
+++ b/net/core/net-sysfs.c
@@ -0,0 +1,461 @@
1/*
2 * net-sysfs.c - network device class and attributes
3 *
4 * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/config.h>
13#include <linux/kernel.h>
14#include <linux/netdevice.h>
15#include <linux/if_arp.h>
16#include <net/sock.h>
17#include <linux/rtnetlink.h>
18#include <linux/wireless.h>
19
20#define to_class_dev(obj) container_of(obj,struct class_device,kobj)
21#define to_net_dev(class) container_of(class, struct net_device, class_dev)
22
23static const char fmt_hex[] = "%#x\n";
24static const char fmt_dec[] = "%d\n";
25static const char fmt_ulong[] = "%lu\n";
26
27static inline int dev_isalive(const struct net_device *dev)
28{
29 return dev->reg_state == NETREG_REGISTERED;
30}
31
32/* use same locking rules as GIF* ioctl's */
33static ssize_t netdev_show(const struct class_device *cd, char *buf,
34 ssize_t (*format)(const struct net_device *, char *))
35{
36 struct net_device *net = to_net_dev(cd);
37 ssize_t ret = -EINVAL;
38
39 read_lock(&dev_base_lock);
40 if (dev_isalive(net))
41 ret = (*format)(net, buf);
42 read_unlock(&dev_base_lock);
43
44 return ret;
45}
46
47/* generate a show function for a simple field */
48#define NETDEVICE_SHOW(field, format_string) \
49static ssize_t format_##field(const struct net_device *net, char *buf) \
50{ \
51 return sprintf(buf, format_string, net->field); \
52} \
53static ssize_t show_##field(struct class_device *cd, char *buf) \
54{ \
55 return netdev_show(cd, buf, format_##field); \
56}
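/*
 * Illustrative expansion (not in the original file): NETDEVICE_SHOW(mtu,
 * fmt_dec) further down generates, roughly,
 *
 *	static ssize_t format_mtu(const struct net_device *net, char *buf)
 *	{
 *		return sprintf(buf, fmt_dec, net->mtu);
 *	}
 *	static ssize_t show_mtu(struct class_device *cd, char *buf)
 *	{
 *		return netdev_show(cd, buf, format_mtu);
 *	}
 */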
57
58
59/* use same locking and permission rules as SIF* ioctl's */
60static ssize_t netdev_store(struct class_device *dev,
61 const char *buf, size_t len,
62 int (*set)(struct net_device *, unsigned long))
63{
64 struct net_device *net = to_net_dev(dev);
65 char *endp;
66 unsigned long new;
67 int ret = -EINVAL;
68
69 if (!capable(CAP_NET_ADMIN))
70 return -EPERM;
71
72 new = simple_strtoul(buf, &endp, 0);
73 if (endp == buf)
74 goto err;
75
76 rtnl_lock();
77 if (dev_isalive(net)) {
78 if ((ret = (*set)(net, new)) == 0)
79 ret = len;
80 }
81 rtnl_unlock();
82 err:
83 return ret;
84}
85
86/* generate a read-only network device class attribute */
87#define NETDEVICE_ATTR(field, format_string) \
88NETDEVICE_SHOW(field, format_string) \
89static CLASS_DEVICE_ATTR(field, S_IRUGO, show_##field, NULL) \
90
91NETDEVICE_ATTR(addr_len, fmt_dec);
92NETDEVICE_ATTR(iflink, fmt_dec);
93NETDEVICE_ATTR(ifindex, fmt_dec);
94NETDEVICE_ATTR(features, fmt_hex);
95NETDEVICE_ATTR(type, fmt_dec);
96
97/* use same locking rules as GIFHWADDR ioctl's */
98static ssize_t format_addr(char *buf, const unsigned char *addr, int len)
99{
100 int i;
101 char *cp = buf;
102
103 for (i = 0; i < len; i++)
104 cp += sprintf(cp, "%02x%c", addr[i],
105 i == (len - 1) ? '\n' : ':');
106 return cp - buf;
107}
108
109static ssize_t show_address(struct class_device *dev, char *buf)
110{
111 struct net_device *net = to_net_dev(dev);
112 ssize_t ret = -EINVAL;
113
114 read_lock(&dev_base_lock);
115 if (dev_isalive(net))
116 ret = format_addr(buf, net->dev_addr, net->addr_len);
117 read_unlock(&dev_base_lock);
118 return ret;
119}
120
121static ssize_t show_broadcast(struct class_device *dev, char *buf)
122{
123 struct net_device *net = to_net_dev(dev);
124 if (dev_isalive(net))
125 return format_addr(buf, net->broadcast, net->addr_len);
126 return -EINVAL;
127}
128
129static ssize_t show_carrier(struct class_device *dev, char *buf)
130{
131 struct net_device *netdev = to_net_dev(dev);
132 if (netif_running(netdev)) {
133 return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
134 }
135 return -EINVAL;
136}
137
138static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
139static CLASS_DEVICE_ATTR(broadcast, S_IRUGO, show_broadcast, NULL);
140static CLASS_DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL);
141
142/* read-write attributes */
143NETDEVICE_SHOW(mtu, fmt_dec);
144
145static int change_mtu(struct net_device *net, unsigned long new_mtu)
146{
147 return dev_set_mtu(net, (int) new_mtu);
148}
149
150static ssize_t store_mtu(struct class_device *dev, const char *buf, size_t len)
151{
152 return netdev_store(dev, buf, len, change_mtu);
153}
154
155static CLASS_DEVICE_ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu);
156
157NETDEVICE_SHOW(flags, fmt_hex);
158
159static int change_flags(struct net_device *net, unsigned long new_flags)
160{
161 return dev_change_flags(net, (unsigned) new_flags);
162}
163
164static ssize_t store_flags(struct class_device *dev, const char *buf, size_t len)
165{
166 return netdev_store(dev, buf, len, change_flags);
167}
168
169static CLASS_DEVICE_ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags);
170
171NETDEVICE_SHOW(tx_queue_len, fmt_ulong);
172
173static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
174{
175 net->tx_queue_len = new_len;
176 return 0;
177}
178
179static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, size_t len)
180{
181 return netdev_store(dev, buf, len, change_tx_queue_len);
182}
183
184static CLASS_DEVICE_ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
185 store_tx_queue_len);
186
187
188static struct class_device_attribute *net_class_attributes[] = {
189 &class_device_attr_ifindex,
190 &class_device_attr_iflink,
191 &class_device_attr_addr_len,
192 &class_device_attr_tx_queue_len,
193 &class_device_attr_features,
194 &class_device_attr_mtu,
195 &class_device_attr_flags,
196 &class_device_attr_type,
197 &class_device_attr_address,
198 &class_device_attr_broadcast,
199 &class_device_attr_carrier,
200 NULL
201};
202
203/* Show a given attribute in the statistics group */
204static ssize_t netstat_show(const struct class_device *cd, char *buf,
205 unsigned long offset)
206{
207 struct net_device *dev = to_net_dev(cd);
208 struct net_device_stats *stats;
209 ssize_t ret = -EINVAL;
210
211 if (offset > sizeof(struct net_device_stats) ||
212 offset % sizeof(unsigned long) != 0)
213 WARN_ON(1);
214
215 read_lock(&dev_base_lock);
216 if (dev_isalive(dev) && dev->get_stats &&
217 (stats = (*dev->get_stats)(dev)))
218 ret = sprintf(buf, fmt_ulong,
219 *(unsigned long *)(((u8 *) stats) + offset));
220
221 read_unlock(&dev_base_lock);
222 return ret;
223}
224
225/* generate a read-only statistics attribute */
226#define NETSTAT_ENTRY(name) \
227static ssize_t show_##name(struct class_device *cd, char *buf) \
228{ \
229 return netstat_show(cd, buf, \
230 offsetof(struct net_device_stats, name)); \
231} \
232static CLASS_DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
233
234NETSTAT_ENTRY(rx_packets);
235NETSTAT_ENTRY(tx_packets);
236NETSTAT_ENTRY(rx_bytes);
237NETSTAT_ENTRY(tx_bytes);
238NETSTAT_ENTRY(rx_errors);
239NETSTAT_ENTRY(tx_errors);
240NETSTAT_ENTRY(rx_dropped);
241NETSTAT_ENTRY(tx_dropped);
242NETSTAT_ENTRY(multicast);
243NETSTAT_ENTRY(collisions);
244NETSTAT_ENTRY(rx_length_errors);
245NETSTAT_ENTRY(rx_over_errors);
246NETSTAT_ENTRY(rx_crc_errors);
247NETSTAT_ENTRY(rx_frame_errors);
248NETSTAT_ENTRY(rx_fifo_errors);
249NETSTAT_ENTRY(rx_missed_errors);
250NETSTAT_ENTRY(tx_aborted_errors);
251NETSTAT_ENTRY(tx_carrier_errors);
252NETSTAT_ENTRY(tx_fifo_errors);
253NETSTAT_ENTRY(tx_heartbeat_errors);
254NETSTAT_ENTRY(tx_window_errors);
255NETSTAT_ENTRY(rx_compressed);
256NETSTAT_ENTRY(tx_compressed);
257
258static struct attribute *netstat_attrs[] = {
259 &class_device_attr_rx_packets.attr,
260 &class_device_attr_tx_packets.attr,
261 &class_device_attr_rx_bytes.attr,
262 &class_device_attr_tx_bytes.attr,
263 &class_device_attr_rx_errors.attr,
264 &class_device_attr_tx_errors.attr,
265 &class_device_attr_rx_dropped.attr,
266 &class_device_attr_tx_dropped.attr,
267 &class_device_attr_multicast.attr,
268 &class_device_attr_collisions.attr,
269 &class_device_attr_rx_length_errors.attr,
270 &class_device_attr_rx_over_errors.attr,
271 &class_device_attr_rx_crc_errors.attr,
272 &class_device_attr_rx_frame_errors.attr,
273 &class_device_attr_rx_fifo_errors.attr,
274 &class_device_attr_rx_missed_errors.attr,
275 &class_device_attr_tx_aborted_errors.attr,
276 &class_device_attr_tx_carrier_errors.attr,
277 &class_device_attr_tx_fifo_errors.attr,
278 &class_device_attr_tx_heartbeat_errors.attr,
279 &class_device_attr_tx_window_errors.attr,
280 &class_device_attr_rx_compressed.attr,
281 &class_device_attr_tx_compressed.attr,
282 NULL
283};
284
285
286static struct attribute_group netstat_group = {
287 .name = "statistics",
288 .attrs = netstat_attrs,
289};
290
291#ifdef WIRELESS_EXT
292/* helper function that does all the locking etc for wireless stats */
293static ssize_t wireless_show(struct class_device *cd, char *buf,
294 ssize_t (*format)(const struct iw_statistics *,
295 char *))
296{
297 struct net_device *dev = to_net_dev(cd);
298 const struct iw_statistics *iw;
299 ssize_t ret = -EINVAL;
300
301 read_lock(&dev_base_lock);
302 if (dev_isalive(dev) && dev->get_wireless_stats
303 && (iw = dev->get_wireless_stats(dev)) != NULL)
304 ret = (*format)(iw, buf);
305 read_unlock(&dev_base_lock);
306
307 return ret;
308}
309
310/* show function template for wireless fields */
311#define WIRELESS_SHOW(name, field, format_string) \
312static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \
313{ \
314 return sprintf(buf, format_string, iw->field); \
315} \
316static ssize_t show_iw_##name(struct class_device *cd, char *buf) \
317{ \
318 return wireless_show(cd, buf, format_iw_##name); \
319} \
320static CLASS_DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL)
321
322WIRELESS_SHOW(status, status, fmt_hex);
323WIRELESS_SHOW(link, qual.qual, fmt_dec);
324WIRELESS_SHOW(level, qual.level, fmt_dec);
325WIRELESS_SHOW(noise, qual.noise, fmt_dec);
326WIRELESS_SHOW(nwid, discard.nwid, fmt_dec);
327WIRELESS_SHOW(crypt, discard.code, fmt_dec);
328WIRELESS_SHOW(fragment, discard.fragment, fmt_dec);
329WIRELESS_SHOW(misc, discard.misc, fmt_dec);
330WIRELESS_SHOW(retries, discard.retries, fmt_dec);
331WIRELESS_SHOW(beacon, miss.beacon, fmt_dec);
332
333static struct attribute *wireless_attrs[] = {
334 &class_device_attr_status.attr,
335 &class_device_attr_link.attr,
336 &class_device_attr_level.attr,
337 &class_device_attr_noise.attr,
338 &class_device_attr_nwid.attr,
339 &class_device_attr_crypt.attr,
340 &class_device_attr_fragment.attr,
341 &class_device_attr_retries.attr,
342 &class_device_attr_misc.attr,
343 &class_device_attr_beacon.attr,
344 NULL
345};
346
347static struct attribute_group wireless_group = {
348 .name = "wireless",
349 .attrs = wireless_attrs,
350};
351#endif
352
353#ifdef CONFIG_HOTPLUG
354static int netdev_hotplug(struct class_device *cd, char **envp,
355 int num_envp, char *buf, int size)
356{
357 struct net_device *dev = to_net_dev(cd);
358 int i = 0;
359 int n;
360
361 /* pass interface in env to hotplug. */
362 envp[i++] = buf;
363 n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1;
364 buf += n;
365 size -= n;
366
367 if ((size <= 0) || (i >= num_envp))
368 return -ENOMEM;
369
370 envp[i] = NULL;
371 return 0;
372}
373#endif
374
375/*
376 * netdev_release -- destroy and free a dead device.
377 * Called when last reference to class_device kobject is gone.
378 */
379static void netdev_release(struct class_device *cd)
380{
381 struct net_device *dev
382 = container_of(cd, struct net_device, class_dev);
383
384 BUG_ON(dev->reg_state != NETREG_RELEASED);
385
386 kfree((char *)dev - dev->padded);
387}
388
389static struct class net_class = {
390 .name = "net",
391 .release = netdev_release,
392#ifdef CONFIG_HOTPLUG
393 .hotplug = netdev_hotplug,
394#endif
395};
396
397void netdev_unregister_sysfs(struct net_device * net)
398{
399 struct class_device * class_dev = &(net->class_dev);
400
401 if (net->get_stats)
402 sysfs_remove_group(&class_dev->kobj, &netstat_group);
403
404#ifdef WIRELESS_EXT
405 if (net->get_wireless_stats)
406 sysfs_remove_group(&class_dev->kobj, &wireless_group);
407#endif
408 class_device_del(class_dev);
409
410}
411
412/* Create sysfs entries for network device. */
413int netdev_register_sysfs(struct net_device *net)
414{
415 struct class_device *class_dev = &(net->class_dev);
416 int i;
417 struct class_device_attribute *attr;
418 int ret;
419
420 class_dev->class = &net_class;
421 class_dev->class_data = net;
422
423 strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE);
424 if ((ret = class_device_register(class_dev)))
425 goto out;
426
427 for (i = 0; (attr = net_class_attributes[i]) != NULL; i++) {
428 if ((ret = class_device_create_file(class_dev, attr)))
429 goto out_unreg;
430 }
431
432
433 if (net->get_stats &&
434 (ret = sysfs_create_group(&class_dev->kobj, &netstat_group)))
435 goto out_unreg;
436
437#ifdef WIRELESS_EXT
438 if (net->get_wireless_stats &&
439 (ret = sysfs_create_group(&class_dev->kobj, &wireless_group)))
440 goto out_cleanup;
441
442 return 0;
443out_cleanup:
444 if (net->get_stats)
445 sysfs_remove_group(&class_dev->kobj, &netstat_group);
446#else
447 return 0;
448#endif
449
450out_unreg:
451 printk(KERN_WARNING "%s: sysfs attribute registration failed %d\n",
452 net->name, ret);
453 class_device_unregister(class_dev);
454out:
455 return ret;
456}
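/*
 * Illustrative result (not in the original file): for an interface named
 * eth0 (name assumed) the registration above creates, among others,
 *
 *	/sys/class/net/eth0/mtu
 *	/sys/class/net/eth0/address
 *	/sys/class/net/eth0/statistics/rx_bytes
 *	/sys/class/net/eth0/wireless/level	(WIRELESS_EXT only)
 */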
457
458int netdev_sysfs_init(void)
459{
460 return class_register(&net_class);
461}
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
new file mode 100644
index 000000000000..e51cfa46950c
--- /dev/null
+++ b/net/core/netfilter.c
@@ -0,0 +1,799 @@
1/* netfilter.c: look after the filters for various protocols.
2 * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
3 *
4 * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
5 * way.
6 *
7 * Rusty Russell (C)2000 -- This code is GPL.
8 *
9 * February 2000: Modified by James Morris to have 1 queue per protocol.
10 * 15-Mar-2000: Added NF_REPEAT --RR.
11 * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik.
12 */
13#include <linux/config.h>
14#include <linux/kernel.h>
15#include <linux/netfilter.h>
16#include <net/protocol.h>
17#include <linux/init.h>
18#include <linux/skbuff.h>
19#include <linux/wait.h>
20#include <linux/module.h>
21#include <linux/interrupt.h>
22#include <linux/if.h>
23#include <linux/netdevice.h>
24#include <linux/inetdevice.h>
25#include <linux/tcp.h>
26#include <linux/udp.h>
27#include <linux/icmp.h>
28#include <net/sock.h>
29#include <net/route.h>
30#include <linux/ip.h>
31
32/* In this code, we can be waiting indefinitely for userspace to
33 * service a packet if a hook returns NF_QUEUE. We could keep a count
34 * of skbuffs queued for userspace, and not deregister a hook unless
35 * this is zero, but that sucks. Now, we simply check when the
36 * packets come back: if the hook is gone, the packet is discarded. */
37#ifdef CONFIG_NETFILTER_DEBUG
38#define NFDEBUG(format, args...) printk(format , ## args)
39#else
40#define NFDEBUG(format, args...)
41#endif
42
43/* Sockopts only registered and called from user context, so
44 net locking would be overkill. Also, [gs]etsockopt calls may
45 sleep. */
46static DECLARE_MUTEX(nf_sockopt_mutex);
47
48struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
49static LIST_HEAD(nf_sockopts);
50static DEFINE_SPINLOCK(nf_hook_lock);
51
52/*
53 * A queue handler may be registered for each protocol.  Each is protected by
54 * a long-term mutex.  The handler must provide an outfn() to accept packets
55 * for queueing and must reinject all packets it receives, no matter what.
56 */
57static struct nf_queue_handler_t {
58 nf_queue_outfn_t outfn;
59 void *data;
60} queue_handler[NPROTO];
61static DEFINE_RWLOCK(queue_handler_lock);
62
63int nf_register_hook(struct nf_hook_ops *reg)
64{
65 struct list_head *i;
66
67 spin_lock_bh(&nf_hook_lock);
68 list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
69 if (reg->priority < ((struct nf_hook_ops *)i)->priority)
70 break;
71 }
72 list_add_rcu(&reg->list, i->prev);
73 spin_unlock_bh(&nf_hook_lock);
74
75 synchronize_net();
76 return 0;
77}
78
79void nf_unregister_hook(struct nf_hook_ops *reg)
80{
81 spin_lock_bh(&nf_hook_lock);
82 list_del_rcu(&reg->list);
83 spin_unlock_bh(&nf_hook_lock);
84
85 synchronize_net();
86}
87
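/*
 * Hedged usage sketch (not part of the original source): a module attaches
 * to the IPv4 PRE_ROUTING hook roughly as below; the hook body and priority
 * are assumptions for illustration.
 *
 *	static unsigned int my_hook(unsigned int hooknum, struct sk_buff **pskb,
 *				    const struct net_device *in,
 *				    const struct net_device *out,
 *				    int (*okfn)(struct sk_buff *))
 *	{
 *		return NF_ACCEPT;
 *	}
 *
 *	static struct nf_hook_ops my_ops = {
 *		.hook     = my_hook,
 *		.owner    = THIS_MODULE,
 *		.pf       = PF_INET,
 *		.hooknum  = NF_IP_PRE_ROUTING,
 *		.priority = NF_IP_PRI_FIRST,
 *	};
 *
 *	nf_register_hook(&my_ops);
 */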
88/* Do exclusive ranges overlap? */
89static inline int overlap(int min1, int max1, int min2, int max2)
90{
91 return max1 > min2 && min1 < max2;
92}
93
94/* Functions to register sockopt ranges (exclusive). */
95int nf_register_sockopt(struct nf_sockopt_ops *reg)
96{
97 struct list_head *i;
98 int ret = 0;
99
100 if (down_interruptible(&nf_sockopt_mutex) != 0)
101 return -EINTR;
102
103 list_for_each(i, &nf_sockopts) {
104 struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
105 if (ops->pf == reg->pf
106 && (overlap(ops->set_optmin, ops->set_optmax,
107 reg->set_optmin, reg->set_optmax)
108 || overlap(ops->get_optmin, ops->get_optmax,
109 reg->get_optmin, reg->get_optmax))) {
110 NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
111 ops->set_optmin, ops->set_optmax,
112 ops->get_optmin, ops->get_optmax,
113 reg->set_optmin, reg->set_optmax,
114 reg->get_optmin, reg->get_optmax);
115 ret = -EBUSY;
116 goto out;
117 }
118 }
119
120 list_add(&reg->list, &nf_sockopts);
121out:
122 up(&nf_sockopt_mutex);
123 return ret;
124}
125
126void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
127{
128 /* No point being interruptible: we're probably in cleanup_module() */
129 restart:
130 down(&nf_sockopt_mutex);
131 if (reg->use != 0) {
132 /* To be woken by nf_sockopt call... */
133 /* FIXME: Stuart Young's name appears gratuitously. */
134 set_current_state(TASK_UNINTERRUPTIBLE);
135 reg->cleanup_task = current;
136 up(&nf_sockopt_mutex);
137 schedule();
138 goto restart;
139 }
140 list_del(&reg->list);
141 up(&nf_sockopt_mutex);
142}
143
144#ifdef CONFIG_NETFILTER_DEBUG
145#include <net/ip.h>
146#include <net/tcp.h>
147#include <linux/netfilter_ipv4.h>
148
149static void debug_print_hooks_ip(unsigned int nf_debug)
150{
151 if (nf_debug & (1 << NF_IP_PRE_ROUTING)) {
152 printk("PRE_ROUTING ");
153 nf_debug ^= (1 << NF_IP_PRE_ROUTING);
154 }
155 if (nf_debug & (1 << NF_IP_LOCAL_IN)) {
156 printk("LOCAL_IN ");
157 nf_debug ^= (1 << NF_IP_LOCAL_IN);
158 }
159 if (nf_debug & (1 << NF_IP_FORWARD)) {
160 printk("FORWARD ");
161 nf_debug ^= (1 << NF_IP_FORWARD);
162 }
163 if (nf_debug & (1 << NF_IP_LOCAL_OUT)) {
164 printk("LOCAL_OUT ");
165 nf_debug ^= (1 << NF_IP_LOCAL_OUT);
166 }
167 if (nf_debug & (1 << NF_IP_POST_ROUTING)) {
168 printk("POST_ROUTING ");
169 nf_debug ^= (1 << NF_IP_POST_ROUTING);
170 }
171 if (nf_debug)
172 printk("Crap bits: 0x%04X", nf_debug);
173 printk("\n");
174}
175
176static void nf_dump_skb(int pf, struct sk_buff *skb)
177{
178 printk("skb: pf=%i %s dev=%s len=%u\n",
179 pf,
180 skb->sk ? "(owned)" : "(unowned)",
181 skb->dev ? skb->dev->name : "(no dev)",
182 skb->len);
183 switch (pf) {
184 case PF_INET: {
185 const struct iphdr *ip = skb->nh.iph;
186 __u32 *opt = (__u32 *) (ip + 1);
187 int opti;
188 __u16 src_port = 0, dst_port = 0;
189
190 if (ip->protocol == IPPROTO_TCP
191 || ip->protocol == IPPROTO_UDP) {
192 struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
193 src_port = ntohs(tcp->source);
194 dst_port = ntohs(tcp->dest);
195 }
196
197 printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu"
198 " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
199 ip->protocol, NIPQUAD(ip->saddr),
200 src_port, NIPQUAD(ip->daddr),
201 dst_port,
202 ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
203 ntohs(ip->frag_off), ip->ttl);
204
205 for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
206 printk(" O=0x%8.8X", *opt++);
207 printk("\n");
208 }
209 }
210}
211
212void nf_debug_ip_local_deliver(struct sk_buff *skb)
213{
214 /* If it's a loopback packet, it must have come through
215 * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and
216 * NF_IP_LOCAL_IN. Otherwise, must have gone through
217 * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. */
218 if (!skb->dev) {
219 printk("ip_local_deliver: skb->dev is NULL.\n");
220 }
221 else if (strcmp(skb->dev->name, "lo") == 0) {
222 if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
223 | (1 << NF_IP_POST_ROUTING)
224 | (1 << NF_IP_PRE_ROUTING)
225 | (1 << NF_IP_LOCAL_IN))) {
226 printk("ip_local_deliver: bad loopback skb: ");
227 debug_print_hooks_ip(skb->nf_debug);
228 nf_dump_skb(PF_INET, skb);
229 }
230 }
231 else {
232 if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING)
233 | (1<<NF_IP_LOCAL_IN))) {
234 printk("ip_local_deliver: bad non-lo skb: ");
235 debug_print_hooks_ip(skb->nf_debug);
236 nf_dump_skb(PF_INET, skb);
237 }
238 }
239}
240
241void nf_debug_ip_loopback_xmit(struct sk_buff *newskb)
242{
243 if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
244 | (1 << NF_IP_POST_ROUTING))) {
245 printk("ip_dev_loopback_xmit: bad owned skb = %p: ",
246 newskb);
247 debug_print_hooks_ip(newskb->nf_debug);
248 nf_dump_skb(PF_INET, newskb);
249 }
250 /* Clear to avoid confusing input check */
251 newskb->nf_debug = 0;
252}
253
254void nf_debug_ip_finish_output2(struct sk_buff *skb)
255{
256 /* If it's owned, it must have gone through the
257 * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING.
258 * Otherwise, must have gone through
259 * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING.
260 */
261 if (skb->sk) {
262 if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
263 | (1 << NF_IP_POST_ROUTING))) {
264 printk("ip_finish_output: bad owned skb = %p: ", skb);
265 debug_print_hooks_ip(skb->nf_debug);
266 nf_dump_skb(PF_INET, skb);
267 }
268 } else {
269 if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING)
270 | (1 << NF_IP_FORWARD)
271 | (1 << NF_IP_POST_ROUTING))) {
272			/* Fragments, tunnelled packets and TCP RSTs
273			   generated by ipt_REJECT will have no
274			   owners, but may still be local */
275 if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
276 | (1 << NF_IP_POST_ROUTING))){
277 printk("ip_finish_output:"
278 " bad unowned skb = %p: ",skb);
279 debug_print_hooks_ip(skb->nf_debug);
280 nf_dump_skb(PF_INET, skb);
281 }
282 }
283 }
284}
285#endif /*CONFIG_NETFILTER_DEBUG*/
286
287/* Call get/setsockopt() */
288static int nf_sockopt(struct sock *sk, int pf, int val,
289 char __user *opt, int *len, int get)
290{
291 struct list_head *i;
292 struct nf_sockopt_ops *ops;
293 int ret;
294
295 if (down_interruptible(&nf_sockopt_mutex) != 0)
296 return -EINTR;
297
298 list_for_each(i, &nf_sockopts) {
299 ops = (struct nf_sockopt_ops *)i;
300 if (ops->pf == pf) {
301 if (get) {
302 if (val >= ops->get_optmin
303 && val < ops->get_optmax) {
304 ops->use++;
305 up(&nf_sockopt_mutex);
306 ret = ops->get(sk, val, opt, len);
307 goto out;
308 }
309 } else {
310 if (val >= ops->set_optmin
311 && val < ops->set_optmax) {
312 ops->use++;
313 up(&nf_sockopt_mutex);
314 ret = ops->set(sk, val, opt, *len);
315 goto out;
316 }
317 }
318 }
319 }
320 up(&nf_sockopt_mutex);
321 return -ENOPROTOOPT;
322
323 out:
324 down(&nf_sockopt_mutex);
325 ops->use--;
326 if (ops->cleanup_task)
327 wake_up_process(ops->cleanup_task);
328 up(&nf_sockopt_mutex);
329 return ret;
330}
331
332int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
333 int len)
334{
335 return nf_sockopt(sk, pf, val, opt, &len, 0);
336}
337
338int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
339{
340 return nf_sockopt(sk, pf, val, opt, len, 1);
341}
342
343static unsigned int nf_iterate(struct list_head *head,
344 struct sk_buff **skb,
345 int hook,
346 const struct net_device *indev,
347 const struct net_device *outdev,
348 struct list_head **i,
349 int (*okfn)(struct sk_buff *),
350 int hook_thresh)
351{
352 unsigned int verdict;
353
354 /*
355 * The caller must not block between calls to this
356	 * function because of the risk of continuing from a deleted element.
357 */
358 list_for_each_continue_rcu(*i, head) {
359 struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
360
361 if (hook_thresh > elem->priority)
362 continue;
363
364 /* Optimization: we don't need to hold module
365 reference here, since function can't sleep. --RR */
366 verdict = elem->hook(hook, skb, indev, outdev, okfn);
367 if (verdict != NF_ACCEPT) {
368#ifdef CONFIG_NETFILTER_DEBUG
369 if (unlikely(verdict > NF_MAX_VERDICT)) {
370 NFDEBUG("Evil return from %p(%u).\n",
371 elem->hook, hook);
372 continue;
373 }
374#endif
375 if (verdict != NF_REPEAT)
376 return verdict;
377 *i = (*i)->prev;
378 }
379 }
380 return NF_ACCEPT;
381}
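
/*
 * Editor's sketch, not part of the original file: the shape of a hook
 * function that nf_iterate() above would call, and its registration.
 * .hook, .owner and .priority are the members used above; .pf/.hooknum and
 * the chosen hook point and priority are illustrative assumptions.
 */
static unsigned int example_hook(unsigned int hooknum, struct sk_buff **pskb,
				 const struct net_device *in,
				 const struct net_device *out,
				 int (*okfn)(struct sk_buff *))
{
	/* Inspect or mangle *pskb here; any verdict handled above is legal:
	 * NF_ACCEPT, NF_DROP, NF_QUEUE or NF_REPEAT. */
	return NF_ACCEPT;
}

static struct nf_hook_ops example_ops = {
	.hook		= example_hook,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_LOCAL_IN,
	.priority	= 0,
};

/* Registered with nf_register_hook(&example_ops) and removed with
 * nf_unregister_hook(&example_ops) (both exported below). */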
382
383int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data)
384{
385 int ret;
386
387 write_lock_bh(&queue_handler_lock);
388 if (queue_handler[pf].outfn)
389 ret = -EBUSY;
390 else {
391 queue_handler[pf].outfn = outfn;
392 queue_handler[pf].data = data;
393 ret = 0;
394 }
395 write_unlock_bh(&queue_handler_lock);
396
397 return ret;
398}
399
400/* The caller must flush their queue before this */
401int nf_unregister_queue_handler(int pf)
402{
403 write_lock_bh(&queue_handler_lock);
404 queue_handler[pf].outfn = NULL;
405 queue_handler[pf].data = NULL;
406 write_unlock_bh(&queue_handler_lock);
407
408 return 0;
409}
410
411/*
412 * Any packet that leaves via this function must come back
413 * through nf_reinject().
414 */
415static int nf_queue(struct sk_buff *skb,
416 struct list_head *elem,
417 int pf, unsigned int hook,
418 struct net_device *indev,
419 struct net_device *outdev,
420 int (*okfn)(struct sk_buff *))
421{
422 int status;
423 struct nf_info *info;
424#ifdef CONFIG_BRIDGE_NETFILTER
425 struct net_device *physindev = NULL;
426 struct net_device *physoutdev = NULL;
427#endif
428
429	/* QUEUE == DROP if no one is waiting, to be safe. */
430 read_lock(&queue_handler_lock);
431 if (!queue_handler[pf].outfn) {
432 read_unlock(&queue_handler_lock);
433 kfree_skb(skb);
434 return 1;
435 }
436
437 info = kmalloc(sizeof(*info), GFP_ATOMIC);
438 if (!info) {
439 if (net_ratelimit())
440 printk(KERN_ERR "OOM queueing packet %p\n",
441 skb);
442 read_unlock(&queue_handler_lock);
443 kfree_skb(skb);
444 return 1;
445 }
446
447 *info = (struct nf_info) {
448 (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
449
450 /* If it's going away, ignore hook. */
451 if (!try_module_get(info->elem->owner)) {
452 read_unlock(&queue_handler_lock);
453 kfree(info);
454 return 0;
455 }
456
457 /* Bump dev refs so they don't vanish while packet is out */
458 if (indev) dev_hold(indev);
459 if (outdev) dev_hold(outdev);
460
461#ifdef CONFIG_BRIDGE_NETFILTER
462 if (skb->nf_bridge) {
463 physindev = skb->nf_bridge->physindev;
464 if (physindev) dev_hold(physindev);
465 physoutdev = skb->nf_bridge->physoutdev;
466 if (physoutdev) dev_hold(physoutdev);
467 }
468#endif
469
470 status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data);
471 read_unlock(&queue_handler_lock);
472
473 if (status < 0) {
474		/* Queueing failed: drop the references we took and free the skb. */
475 if (indev) dev_put(indev);
476 if (outdev) dev_put(outdev);
477#ifdef CONFIG_BRIDGE_NETFILTER
478 if (physindev) dev_put(physindev);
479 if (physoutdev) dev_put(physoutdev);
480#endif
481 module_put(info->elem->owner);
482 kfree(info);
483 kfree_skb(skb);
484 return 1;
485 }
486 return 1;
487}
488
489/* Returns 1 if okfn() needs to be executed by the caller,
490 * -EPERM for NF_DROP, 0 otherwise. */
491int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
492 struct net_device *indev,
493 struct net_device *outdev,
494 int (*okfn)(struct sk_buff *),
495 int hook_thresh)
496{
497 struct list_head *elem;
498 unsigned int verdict;
499 int ret = 0;
500
501 /* We may already have this, but read-locks nest anyway */
502 rcu_read_lock();
503
504#ifdef CONFIG_NETFILTER_DEBUG
505 if (unlikely((*pskb)->nf_debug & (1 << hook))) {
506 printk("nf_hook: hook %i already set.\n", hook);
507 nf_dump_skb(pf, *pskb);
508 }
509 (*pskb)->nf_debug |= (1 << hook);
510#endif
511
512 elem = &nf_hooks[pf][hook];
513next_hook:
514 verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
515 outdev, &elem, okfn, hook_thresh);
516 if (verdict == NF_ACCEPT || verdict == NF_STOP) {
517 ret = 1;
518 goto unlock;
519 } else if (verdict == NF_DROP) {
520 kfree_skb(*pskb);
521 ret = -EPERM;
522 } else if (verdict == NF_QUEUE) {
523 NFDEBUG("nf_hook: Verdict = QUEUE.\n");
524 if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn))
525 goto next_hook;
526 }
527unlock:
528 rcu_read_unlock();
529 return ret;
530}
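
/*
 * Editor's sketch, not part of the original file: how a caller is expected
 * to use the return value documented above (the real entry point is the
 * NF_HOOK() wrapper in the headers, not shown here).  Hypothetical helper:
 */
static inline int example_nf_hook(int pf, unsigned int hook,
				  struct sk_buff *skb,
				  struct net_device *in,
				  struct net_device *out,
				  int (*okfn)(struct sk_buff *))
{
	int ret = nf_hook_slow(pf, hook, &skb, in, out, okfn, INT_MIN);

	if (ret == 1)		/* every hook accepted: deliver normally */
		ret = okfn(skb);
	return ret;		/* 0: queued or stolen, -EPERM: dropped */
}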
531
532void nf_reinject(struct sk_buff *skb, struct nf_info *info,
533 unsigned int verdict)
534{
535 struct list_head *elem = &info->elem->list;
536 struct list_head *i;
537
538 rcu_read_lock();
539
540 /* Release those devices we held, or Alexey will kill me. */
541 if (info->indev) dev_put(info->indev);
542 if (info->outdev) dev_put(info->outdev);
543#ifdef CONFIG_BRIDGE_NETFILTER
544 if (skb->nf_bridge) {
545 if (skb->nf_bridge->physindev)
546 dev_put(skb->nf_bridge->physindev);
547 if (skb->nf_bridge->physoutdev)
548 dev_put(skb->nf_bridge->physoutdev);
549 }
550#endif
551
552 /* Drop reference to owner of hook which queued us. */
553 module_put(info->elem->owner);
554
555 list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
556 if (i == elem)
557 break;
558 }
559
560 if (elem == &nf_hooks[info->pf][info->hook]) {
561 /* The module which sent it to userspace is gone. */
562 NFDEBUG("%s: module disappeared, dropping packet.\n",
563 __FUNCTION__);
564 verdict = NF_DROP;
565 }
566
567 /* Continue traversal iff userspace said ok... */
568 if (verdict == NF_REPEAT) {
569 elem = elem->prev;
570 verdict = NF_ACCEPT;
571 }
572
573 if (verdict == NF_ACCEPT) {
574 next_hook:
575 verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
576 &skb, info->hook,
577 info->indev, info->outdev, &elem,
578 info->okfn, INT_MIN);
579 }
580
581 switch (verdict) {
582 case NF_ACCEPT:
583 info->okfn(skb);
584 break;
585
586 case NF_QUEUE:
587 if (!nf_queue(skb, elem, info->pf, info->hook,
588 info->indev, info->outdev, info->okfn))
589 goto next_hook;
590 break;
591 }
592 rcu_read_unlock();
593
594 if (verdict == NF_DROP)
595 kfree_skb(skb);
596
597 kfree(info);
598 return;
599}
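
/*
 * Editor's sketch, not part of the original file: the smallest possible
 * queue handler for the nf_queue()/nf_reinject() round trip above.  A real
 * handler (e.g. a userspace queueing module) would store skb+info and call
 * nf_reinject() later with the verdict it receives; reinjecting inline here
 * only keeps the example short.
 */
static int example_outfn(struct sk_buff *skb, struct nf_info *info, void *data)
{
	nf_reinject(skb, info, NF_ACCEPT);
	return 0;		/* >= 0: we took ownership of the packet */
}

/* Bound to IPv4 with      nf_register_queue_handler(PF_INET, example_outfn, NULL);
 * and removed again with  nf_unregister_queue_handler(PF_INET);  */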
600
601#ifdef CONFIG_INET
602/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
603int ip_route_me_harder(struct sk_buff **pskb)
604{
605 struct iphdr *iph = (*pskb)->nh.iph;
606 struct rtable *rt;
607 struct flowi fl = {};
608 struct dst_entry *odst;
609 unsigned int hh_len;
610
611 /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
612 * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
613 */
614 if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
615 fl.nl_u.ip4_u.daddr = iph->daddr;
616 fl.nl_u.ip4_u.saddr = iph->saddr;
617 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
618 fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
619#ifdef CONFIG_IP_ROUTE_FWMARK
620 fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
621#endif
622 fl.proto = iph->protocol;
623 if (ip_route_output_key(&rt, &fl) != 0)
624 return -1;
625
626 /* Drop old route. */
627 dst_release((*pskb)->dst);
628 (*pskb)->dst = &rt->u.dst;
629 } else {
630 /* non-local src, find valid iif to satisfy
631 * rp-filter when calling ip_route_input. */
632 fl.nl_u.ip4_u.daddr = iph->saddr;
633 if (ip_route_output_key(&rt, &fl) != 0)
634 return -1;
635
636 odst = (*pskb)->dst;
637 if (ip_route_input(*pskb, iph->daddr, iph->saddr,
638 RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
639 dst_release(&rt->u.dst);
640 return -1;
641 }
642 dst_release(&rt->u.dst);
643 dst_release(odst);
644 }
645
646 if ((*pskb)->dst->error)
647 return -1;
648
649 /* Change in oif may mean change in hh_len. */
650 hh_len = (*pskb)->dst->dev->hard_header_len;
651 if (skb_headroom(*pskb) < hh_len) {
652 struct sk_buff *nskb;
653
654 nskb = skb_realloc_headroom(*pskb, hh_len);
655 if (!nskb)
656 return -1;
657 if ((*pskb)->sk)
658 skb_set_owner_w(nskb, (*pskb)->sk);
659 kfree_skb(*pskb);
660 *pskb = nskb;
661 }
662
663 return 0;
664}
665EXPORT_SYMBOL(ip_route_me_harder);
666
667int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len)
668{
669 struct sk_buff *nskb;
670
671 if (writable_len > (*pskb)->len)
672 return 0;
673
674 /* Not exclusive use of packet? Must copy. */
675 if (skb_shared(*pskb) || skb_cloned(*pskb))
676 goto copy_skb;
677
678 return pskb_may_pull(*pskb, writable_len);
679
680copy_skb:
681 nskb = skb_copy(*pskb, GFP_ATOMIC);
682 if (!nskb)
683 return 0;
684 BUG_ON(skb_is_nonlinear(nskb));
685
686 /* Rest of kernel will get very unhappy if we pass it a
687 suddenly-orphaned skbuff */
688 if ((*pskb)->sk)
689 skb_set_owner_w(nskb, (*pskb)->sk);
690 kfree_skb(*pskb);
691 *pskb = nskb;
692 return 1;
693}
694EXPORT_SYMBOL(skb_ip_make_writable);
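
/*
 * Editor's sketch, not part of the original file: the calling convention a
 * packet-mangling user of skb_ip_make_writable() is expected to follow.
 * Rewriting the TOS field is only an illustration (a real target would also
 * recompute iph->check afterwards).
 */
static unsigned int example_set_tos(struct sk_buff **pskb, unsigned char tos)
{
	struct iphdr *iph;

	if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
		return NF_DROP;		/* could not get a private, linear copy */

	iph = (*pskb)->nh.iph;		/* note: *pskb may have been replaced */
	iph->tos = tos;
	return NF_ACCEPT;
}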
695#endif /*CONFIG_INET*/
696
697/* Internal logging interface, which relies on the real
698 LOG target modules */
699
700#define NF_LOG_PREFIXLEN 128
701
702static nf_logfn *nf_logging[NPROTO]; /* = NULL */
703static int reported = 0;
704static DEFINE_SPINLOCK(nf_log_lock);
705
706int nf_log_register(int pf, nf_logfn *logfn)
707{
708 int ret = -EBUSY;
709
710 /* Any setup of logging members must be done before
711 * substituting pointer. */
712 spin_lock(&nf_log_lock);
713 if (!nf_logging[pf]) {
714 rcu_assign_pointer(nf_logging[pf], logfn);
715 ret = 0;
716 }
717 spin_unlock(&nf_log_lock);
718 return ret;
719}
720
721void nf_log_unregister(int pf, nf_logfn *logfn)
722{
723 spin_lock(&nf_log_lock);
724 if (nf_logging[pf] == logfn)
725 nf_logging[pf] = NULL;
726 spin_unlock(&nf_log_lock);
727
728 /* Give time to concurrent readers. */
729 synchronize_net();
730}
731
732void nf_log_packet(int pf,
733 unsigned int hooknum,
734 const struct sk_buff *skb,
735 const struct net_device *in,
736 const struct net_device *out,
737 const char *fmt, ...)
738{
739 va_list args;
740 char prefix[NF_LOG_PREFIXLEN];
741 nf_logfn *logfn;
742
743 rcu_read_lock();
744 logfn = rcu_dereference(nf_logging[pf]);
745 if (logfn) {
746 va_start(args, fmt);
747 vsnprintf(prefix, sizeof(prefix), fmt, args);
748 va_end(args);
749		/* We must read nf_logging[pf] before calling through it */
750 logfn(hooknum, skb, in, out, prefix);
751 } else if (!reported) {
752		printk(KERN_WARNING "nf_log_packet: can't log yet, "
753		       "no backend logging module loaded!\n");
754 reported++;
755 }
756 rcu_read_unlock();
757}
758EXPORT_SYMBOL(nf_log_register);
759EXPORT_SYMBOL(nf_log_unregister);
760EXPORT_SYMBOL(nf_log_packet);
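
/*
 * Editor's sketch, not part of the original file: a minimal backend for the
 * logging interface above.  The prototype mirrors the way nf_log_packet()
 * invokes the registered function; the message format is arbitrary.
 */
static void example_logfn(unsigned int hooknum, const struct sk_buff *skb,
			  const struct net_device *in,
			  const struct net_device *out, const char *prefix)
{
	printk(KERN_INFO "%shook=%u in=%s out=%s len=%u\n", prefix, hooknum,
	       in ? in->name : "?", out ? out->name : "?", skb->len);
}

/* A LOG-style module would call nf_log_register(PF_INET, example_logfn) at
 * init and nf_log_unregister(PF_INET, example_logfn) at exit. */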
761
762/* This does not belong here, but locally generated errors need it if connection
763   tracking is in use: without this, the connection may not be in the hash table,
764   and hence manufactured ICMP or RST packets will not be associated with it. */
765void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
766
767void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
768{
769 void (*attach)(struct sk_buff *, struct sk_buff *);
770
771 if (skb->nfct && (attach = ip_ct_attach) != NULL) {
772 mb(); /* Just to be sure: must be read before executing this */
773 attach(new, skb);
774 }
775}
776
777void __init netfilter_init(void)
778{
779 int i, h;
780
781 for (i = 0; i < NPROTO; i++) {
782 for (h = 0; h < NF_MAX_HOOKS; h++)
783 INIT_LIST_HEAD(&nf_hooks[i][h]);
784 }
785}
786
787EXPORT_SYMBOL(ip_ct_attach);
788EXPORT_SYMBOL(nf_ct_attach);
789EXPORT_SYMBOL(nf_getsockopt);
790EXPORT_SYMBOL(nf_hook_slow);
791EXPORT_SYMBOL(nf_hooks);
792EXPORT_SYMBOL(nf_register_hook);
793EXPORT_SYMBOL(nf_register_queue_handler);
794EXPORT_SYMBOL(nf_register_sockopt);
795EXPORT_SYMBOL(nf_reinject);
796EXPORT_SYMBOL(nf_setsockopt);
797EXPORT_SYMBOL(nf_unregister_hook);
798EXPORT_SYMBOL(nf_unregister_queue_handler);
799EXPORT_SYMBOL(nf_unregister_sockopt);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
new file mode 100644
index 000000000000..a119696d5521
--- /dev/null
+++ b/net/core/netpoll.c
@@ -0,0 +1,735 @@
1/*
2 * Common framework for low-level network console, dump, and debugger code
3 *
4 * Sep 8 2003 Matt Mackall <mpm@selenic.com>
5 *
6 * based on the netconsole code from:
7 *
8 * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
9 * Copyright (C) 2002 Red Hat, Inc.
10 */
11
12#include <linux/smp_lock.h>
13#include <linux/netdevice.h>
14#include <linux/etherdevice.h>
15#include <linux/string.h>
16#include <linux/inetdevice.h>
17#include <linux/inet.h>
18#include <linux/interrupt.h>
19#include <linux/netpoll.h>
20#include <linux/sched.h>
21#include <linux/delay.h>
22#include <linux/rcupdate.h>
23#include <linux/workqueue.h>
24#include <net/tcp.h>
25#include <net/udp.h>
26#include <asm/unaligned.h>
27
28/*
29 * We maintain a small pool of fully-sized skbs, to make sure the
30 * message gets out even in extreme OOM situations.
31 */
32
33#define MAX_UDP_CHUNK 1460
34#define MAX_SKBS 32
35#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
36
37static DEFINE_SPINLOCK(skb_list_lock);
38static int nr_skbs;
39static struct sk_buff *skbs;
40
41static DEFINE_SPINLOCK(queue_lock);
42static int queue_depth;
43static struct sk_buff *queue_head, *queue_tail;
44
45static atomic_t trapped;
46
47#define NETPOLL_RX_ENABLED 1
48#define NETPOLL_RX_DROP 2
49
50#define MAX_SKB_SIZE \
51 (MAX_UDP_CHUNK + sizeof(struct udphdr) + \
52 sizeof(struct iphdr) + sizeof(struct ethhdr))
53
54static void zap_completion_queue(void);
55
56static void queue_process(void *p)
57{
58 unsigned long flags;
59 struct sk_buff *skb;
60
61 while (queue_head) {
62 spin_lock_irqsave(&queue_lock, flags);
63
64 skb = queue_head;
65 queue_head = skb->next;
66 if (skb == queue_tail)
67 queue_head = NULL;
68
69 queue_depth--;
70
71 spin_unlock_irqrestore(&queue_lock, flags);
72
73 dev_queue_xmit(skb);
74 }
75}
76
77static DECLARE_WORK(send_queue, queue_process, NULL);
78
79void netpoll_queue(struct sk_buff *skb)
80{
81 unsigned long flags;
82
83 if (queue_depth == MAX_QUEUE_DEPTH) {
84 __kfree_skb(skb);
85 return;
86 }
87
88 spin_lock_irqsave(&queue_lock, flags);
89 if (!queue_head)
90 queue_head = skb;
91 else
92 queue_tail->next = skb;
93 queue_tail = skb;
94 queue_depth++;
95 spin_unlock_irqrestore(&queue_lock, flags);
96
97 schedule_work(&send_queue);
98}
99
100static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
101 unsigned short ulen, u32 saddr, u32 daddr)
102{
103 if (uh->check == 0)
104 return 0;
105
106 if (skb->ip_summed == CHECKSUM_HW)
107 return csum_tcpudp_magic(
108 saddr, daddr, ulen, IPPROTO_UDP, skb->csum);
109
110 skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
111
112 return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
113}
114
115/*
116 * Check whether delayed processing was scheduled for our NIC. If so,
117 * we attempt to grab the poll lock and use ->poll() to pump the card.
118 * If this fails, either we've recursed in ->poll() or it's already
119 * running on another CPU.
120 *
121 * Note: we don't mask interrupts with this lock because we're using
122 * trylock here and interrupts are already disabled in the softirq
123 * case. Further, we test the poll_owner to avoid recursion on UP
124 * systems where the lock doesn't exist.
125 *
126 * In cases where there is bi-directional communication, reading only
127 * one message at a time can lead to packets being dropped by the
128 * network adapter, forcing superfluous retries and possibly timeouts.
129 * Thus, we set our budget to greater than 1.
130 */
131static void poll_napi(struct netpoll *np)
132{
133 int budget = 16;
134
135 if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
136 np->poll_owner != smp_processor_id() &&
137 spin_trylock(&np->poll_lock)) {
138 np->rx_flags |= NETPOLL_RX_DROP;
139 atomic_inc(&trapped);
140
141 np->dev->poll(np->dev, &budget);
142
143 atomic_dec(&trapped);
144 np->rx_flags &= ~NETPOLL_RX_DROP;
145 spin_unlock(&np->poll_lock);
146 }
147}
148
149void netpoll_poll(struct netpoll *np)
150{
151 if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller)
152 return;
153
154 /* Process pending work on NIC */
155 np->dev->poll_controller(np->dev);
156 if (np->dev->poll)
157 poll_napi(np);
158
159 zap_completion_queue();
160}
161
162static void refill_skbs(void)
163{
164 struct sk_buff *skb;
165 unsigned long flags;
166
167 spin_lock_irqsave(&skb_list_lock, flags);
168 while (nr_skbs < MAX_SKBS) {
169 skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
170 if (!skb)
171 break;
172
173 skb->next = skbs;
174 skbs = skb;
175 nr_skbs++;
176 }
177 spin_unlock_irqrestore(&skb_list_lock, flags);
178}
179
180static void zap_completion_queue(void)
181{
182 unsigned long flags;
183 struct softnet_data *sd = &get_cpu_var(softnet_data);
184
185 if (sd->completion_queue) {
186 struct sk_buff *clist;
187
188 local_irq_save(flags);
189 clist = sd->completion_queue;
190 sd->completion_queue = NULL;
191 local_irq_restore(flags);
192
193 while (clist != NULL) {
194 struct sk_buff *skb = clist;
195 clist = clist->next;
196 if(skb->destructor)
197 dev_kfree_skb_any(skb); /* put this one back */
198 else
199 __kfree_skb(skb);
200 }
201 }
202
203 put_cpu_var(softnet_data);
204}
205
206static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve)
207{
208 int once = 1, count = 0;
209 unsigned long flags;
210 struct sk_buff *skb = NULL;
211
212 zap_completion_queue();
213repeat:
214 if (nr_skbs < MAX_SKBS)
215 refill_skbs();
216
217 skb = alloc_skb(len, GFP_ATOMIC);
218
219 if (!skb) {
220 spin_lock_irqsave(&skb_list_lock, flags);
221 skb = skbs;
222 if (skb) {
223 skbs = skb->next;
224 skb->next = NULL;
225 nr_skbs--;
226 }
227 spin_unlock_irqrestore(&skb_list_lock, flags);
228 }
229
230 if(!skb) {
231 count++;
232 if (once && (count == 1000000)) {
233 printk("out of netpoll skbs!\n");
234 once = 0;
235 }
236 netpoll_poll(np);
237 goto repeat;
238 }
239
240 atomic_set(&skb->users, 1);
241 skb_reserve(skb, reserve);
242 return skb;
243}
244
245static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
246{
247 int status;
248
249repeat:
250 if(!np || !np->dev || !netif_running(np->dev)) {
251 __kfree_skb(skb);
252 return;
253 }
254
255 /* avoid recursion */
256 if(np->poll_owner == smp_processor_id() ||
257 np->dev->xmit_lock_owner == smp_processor_id()) {
258 if (np->drop)
259 np->drop(skb);
260 else
261 __kfree_skb(skb);
262 return;
263 }
264
265 spin_lock(&np->dev->xmit_lock);
266 np->dev->xmit_lock_owner = smp_processor_id();
267
268 /*
269 * network drivers do not expect to be called if the queue is
270 * stopped.
271 */
272 if (netif_queue_stopped(np->dev)) {
273 np->dev->xmit_lock_owner = -1;
274 spin_unlock(&np->dev->xmit_lock);
275
276 netpoll_poll(np);
277 goto repeat;
278 }
279
280 status = np->dev->hard_start_xmit(skb, np->dev);
281 np->dev->xmit_lock_owner = -1;
282 spin_unlock(&np->dev->xmit_lock);
283
284 /* transmit busy */
285 if(status) {
286 netpoll_poll(np);
287 goto repeat;
288 }
289}
290
291void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
292{
293 int total_len, eth_len, ip_len, udp_len;
294 struct sk_buff *skb;
295 struct udphdr *udph;
296 struct iphdr *iph;
297 struct ethhdr *eth;
298
299 udp_len = len + sizeof(*udph);
300 ip_len = eth_len = udp_len + sizeof(*iph);
301 total_len = eth_len + ETH_HLEN + NET_IP_ALIGN;
302
303 skb = find_skb(np, total_len, total_len - len);
304 if (!skb)
305 return;
306
307 memcpy(skb->data, msg, len);
308 skb->len += len;
309
310 udph = (struct udphdr *) skb_push(skb, sizeof(*udph));
311 udph->source = htons(np->local_port);
312 udph->dest = htons(np->remote_port);
313 udph->len = htons(udp_len);
314 udph->check = 0;
315
316 iph = (struct iphdr *)skb_push(skb, sizeof(*iph));
317
318 /* iph->version = 4; iph->ihl = 5; */
319 put_unaligned(0x45, (unsigned char *)iph);
320 iph->tos = 0;
321 put_unaligned(htons(ip_len), &(iph->tot_len));
322 iph->id = 0;
323 iph->frag_off = 0;
324 iph->ttl = 64;
325 iph->protocol = IPPROTO_UDP;
326 iph->check = 0;
327 put_unaligned(htonl(np->local_ip), &(iph->saddr));
328 put_unaligned(htonl(np->remote_ip), &(iph->daddr));
329 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
330
331 eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
332
333 eth->h_proto = htons(ETH_P_IP);
334 memcpy(eth->h_source, np->local_mac, 6);
335 memcpy(eth->h_dest, np->remote_mac, 6);
336
337 skb->dev = np->dev;
338
339 netpoll_send_skb(np, skb);
340}
341
342static void arp_reply(struct sk_buff *skb)
343{
344 struct arphdr *arp;
345 unsigned char *arp_ptr;
346 int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
347 u32 sip, tip;
348 struct sk_buff *send_skb;
349 struct netpoll *np = skb->dev->np;
350
351 if (!np) return;
352
353 /* No arp on this interface */
354 if (skb->dev->flags & IFF_NOARP)
355 return;
356
357 if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
358 (2 * skb->dev->addr_len) +
359 (2 * sizeof(u32)))))
360 return;
361
362 skb->h.raw = skb->nh.raw = skb->data;
363 arp = skb->nh.arph;
364
365 if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
366 arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
367 arp->ar_pro != htons(ETH_P_IP) ||
368 arp->ar_op != htons(ARPOP_REQUEST))
369 return;
370
371 arp_ptr = (unsigned char *)(arp+1) + skb->dev->addr_len;
372 memcpy(&sip, arp_ptr, 4);
373 arp_ptr += 4 + skb->dev->addr_len;
374 memcpy(&tip, arp_ptr, 4);
375
376 /* Should we ignore arp? */
377 if (tip != htonl(np->local_ip) || LOOPBACK(tip) || MULTICAST(tip))
378 return;
379
380 size = sizeof(struct arphdr) + 2 * (skb->dev->addr_len + 4);
381 send_skb = find_skb(np, size + LL_RESERVED_SPACE(np->dev),
382 LL_RESERVED_SPACE(np->dev));
383
384 if (!send_skb)
385 return;
386
387 send_skb->nh.raw = send_skb->data;
388 arp = (struct arphdr *) skb_put(send_skb, size);
389 send_skb->dev = skb->dev;
390 send_skb->protocol = htons(ETH_P_ARP);
391
392 /* Fill the device header for the ARP frame */
393
394 if (np->dev->hard_header &&
395 np->dev->hard_header(send_skb, skb->dev, ptype,
396 np->remote_mac, np->local_mac,
397 send_skb->len) < 0) {
398 kfree_skb(send_skb);
399 return;
400 }
401
402 /*
403 * Fill out the arp protocol part.
404 *
405 * we only support ethernet device type,
406 * which (according to RFC 1390) should always equal 1 (Ethernet).
407 */
408
409 arp->ar_hrd = htons(np->dev->type);
410 arp->ar_pro = htons(ETH_P_IP);
411 arp->ar_hln = np->dev->addr_len;
412 arp->ar_pln = 4;
413 arp->ar_op = htons(type);
414
415 arp_ptr=(unsigned char *)(arp + 1);
416 memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
417 arp_ptr += np->dev->addr_len;
418 memcpy(arp_ptr, &tip, 4);
419 arp_ptr += 4;
420 memcpy(arp_ptr, np->remote_mac, np->dev->addr_len);
421 arp_ptr += np->dev->addr_len;
422 memcpy(arp_ptr, &sip, 4);
423
424 netpoll_send_skb(np, send_skb);
425}
426
427int __netpoll_rx(struct sk_buff *skb)
428{
429 int proto, len, ulen;
430 struct iphdr *iph;
431 struct udphdr *uh;
432 struct netpoll *np = skb->dev->np;
433
434 if (!np->rx_hook)
435 goto out;
436 if (skb->dev->type != ARPHRD_ETHER)
437 goto out;
438
439 /* check if netpoll clients need ARP */
440 if (skb->protocol == __constant_htons(ETH_P_ARP) &&
441 atomic_read(&trapped)) {
442 arp_reply(skb);
443 return 1;
444 }
445
446 proto = ntohs(eth_hdr(skb)->h_proto);
447 if (proto != ETH_P_IP)
448 goto out;
449 if (skb->pkt_type == PACKET_OTHERHOST)
450 goto out;
451 if (skb_shared(skb))
452 goto out;
453
454 iph = (struct iphdr *)skb->data;
455 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
456 goto out;
457 if (iph->ihl < 5 || iph->version != 4)
458 goto out;
459 if (!pskb_may_pull(skb, iph->ihl*4))
460 goto out;
461 if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
462 goto out;
463
464 len = ntohs(iph->tot_len);
465 if (skb->len < len || len < iph->ihl*4)
466 goto out;
467
468 if (iph->protocol != IPPROTO_UDP)
469 goto out;
470
471 len -= iph->ihl*4;
472 uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
473 ulen = ntohs(uh->len);
474
475 if (ulen != len)
476 goto out;
477 if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0)
478 goto out;
479 if (np->local_ip && np->local_ip != ntohl(iph->daddr))
480 goto out;
481 if (np->remote_ip && np->remote_ip != ntohl(iph->saddr))
482 goto out;
483 if (np->local_port && np->local_port != ntohs(uh->dest))
484 goto out;
485
486 np->rx_hook(np, ntohs(uh->source),
487 (char *)(uh+1),
488 ulen - sizeof(struct udphdr));
489
490 kfree_skb(skb);
491 return 1;
492
493out:
494 if (atomic_read(&trapped)) {
495 kfree_skb(skb);
496 return 1;
497 }
498
499 return 0;
500}
501
502int netpoll_parse_options(struct netpoll *np, char *opt)
503{
504 char *cur=opt, *delim;
505
506 if(*cur != '@') {
507 if ((delim = strchr(cur, '@')) == NULL)
508 goto parse_failed;
509 *delim=0;
510 np->local_port=simple_strtol(cur, NULL, 10);
511 cur=delim;
512 }
513 cur++;
514 printk(KERN_INFO "%s: local port %d\n", np->name, np->local_port);
515
516 if(*cur != '/') {
517 if ((delim = strchr(cur, '/')) == NULL)
518 goto parse_failed;
519 *delim=0;
520 np->local_ip=ntohl(in_aton(cur));
521 cur=delim;
522
523 printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
524 np->name, HIPQUAD(np->local_ip));
525 }
526 cur++;
527
528 if ( *cur != ',') {
529 /* parse out dev name */
530 if ((delim = strchr(cur, ',')) == NULL)
531 goto parse_failed;
532 *delim=0;
533 strlcpy(np->dev_name, cur, sizeof(np->dev_name));
534 cur=delim;
535 }
536 cur++;
537
538 printk(KERN_INFO "%s: interface %s\n", np->name, np->dev_name);
539
540 if ( *cur != '@' ) {
541 /* dst port */
542 if ((delim = strchr(cur, '@')) == NULL)
543 goto parse_failed;
544 *delim=0;
545 np->remote_port=simple_strtol(cur, NULL, 10);
546 cur=delim;
547 }
548 cur++;
549 printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port);
550
551 /* dst ip */
552 if ((delim = strchr(cur, '/')) == NULL)
553 goto parse_failed;
554 *delim=0;
555 np->remote_ip=ntohl(in_aton(cur));
556 cur=delim+1;
557
558 printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n",
559 np->name, HIPQUAD(np->remote_ip));
560
561 if( *cur != 0 )
562 {
563 /* MAC address */
564 if ((delim = strchr(cur, ':')) == NULL)
565 goto parse_failed;
566 *delim=0;
567 np->remote_mac[0]=simple_strtol(cur, NULL, 16);
568 cur=delim+1;
569 if ((delim = strchr(cur, ':')) == NULL)
570 goto parse_failed;
571 *delim=0;
572 np->remote_mac[1]=simple_strtol(cur, NULL, 16);
573 cur=delim+1;
574 if ((delim = strchr(cur, ':')) == NULL)
575 goto parse_failed;
576 *delim=0;
577 np->remote_mac[2]=simple_strtol(cur, NULL, 16);
578 cur=delim+1;
579 if ((delim = strchr(cur, ':')) == NULL)
580 goto parse_failed;
581 *delim=0;
582 np->remote_mac[3]=simple_strtol(cur, NULL, 16);
583 cur=delim+1;
584 if ((delim = strchr(cur, ':')) == NULL)
585 goto parse_failed;
586 *delim=0;
587 np->remote_mac[4]=simple_strtol(cur, NULL, 16);
588 cur=delim+1;
589 np->remote_mac[5]=simple_strtol(cur, NULL, 16);
590 }
591
592 printk(KERN_INFO "%s: remote ethernet address "
593 "%02x:%02x:%02x:%02x:%02x:%02x\n",
594 np->name,
595 np->remote_mac[0],
596 np->remote_mac[1],
597 np->remote_mac[2],
598 np->remote_mac[3],
599 np->remote_mac[4],
600 np->remote_mac[5]);
601
602 return 0;
603
604 parse_failed:
605 printk(KERN_INFO "%s: couldn't parse config at %s!\n",
606 np->name, cur);
607 return -1;
608}
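
/*
 * Editor's note, not part of the original file: from the parsing above the
 * option string has the form
 *
 *     <local_port>@<local_ip>/<device>,<remote_port>@<remote_ip>/<remote_mac>
 *
 * where every field before the remote IP may be left empty to keep its
 * current value and the trailing MAC is optional.  A hypothetical call
 * (the buffer must be writable, since the parser NUL-terminates fields
 * in place):
 *
 *     char opt[] = "6665@10.0.0.2/eth1,6666@10.0.0.1/00:11:22:33:44:55";
 *     if (netpoll_parse_options(&np, opt) < 0)
 *             return -EINVAL;
 */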
609
610int netpoll_setup(struct netpoll *np)
611{
612 struct net_device *ndev = NULL;
613 struct in_device *in_dev;
614
615 np->poll_lock = SPIN_LOCK_UNLOCKED;
616 np->poll_owner = -1;
617
618 if (np->dev_name)
619 ndev = dev_get_by_name(np->dev_name);
620 if (!ndev) {
621 printk(KERN_ERR "%s: %s doesn't exist, aborting.\n",
622 np->name, np->dev_name);
623 return -1;
624 }
625
626 np->dev = ndev;
627 ndev->np = np;
628
629 if (!ndev->poll_controller) {
630 printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
631 np->name, np->dev_name);
632 goto release;
633 }
634
635 if (!netif_running(ndev)) {
636 unsigned long atmost, atleast;
637
638 printk(KERN_INFO "%s: device %s not up yet, forcing it\n",
639 np->name, np->dev_name);
640
641 rtnl_shlock();
642 if (dev_change_flags(ndev, ndev->flags | IFF_UP) < 0) {
643 printk(KERN_ERR "%s: failed to open %s\n",
644 np->name, np->dev_name);
645 rtnl_shunlock();
646 goto release;
647 }
648 rtnl_shunlock();
649
650 atleast = jiffies + HZ/10;
651 atmost = jiffies + 4*HZ;
652 while (!netif_carrier_ok(ndev)) {
653 if (time_after(jiffies, atmost)) {
654 printk(KERN_NOTICE
655 "%s: timeout waiting for carrier\n",
656 np->name);
657 break;
658 }
659 cond_resched();
660 }
661
662 /* If carrier appears to come up instantly, we don't
663 * trust it and pause so that we don't pump all our
664 * queued console messages into the bitbucket.
665 */
666
667 if (time_before(jiffies, atleast)) {
668 printk(KERN_NOTICE "%s: carrier detect appears"
669 " untrustworthy, waiting 4 seconds\n",
670 np->name);
671 msleep(4000);
672 }
673 }
674
675 if (!memcmp(np->local_mac, "\0\0\0\0\0\0", 6) && ndev->dev_addr)
676 memcpy(np->local_mac, ndev->dev_addr, 6);
677
678 if (!np->local_ip) {
679 rcu_read_lock();
680 in_dev = __in_dev_get(ndev);
681
682 if (!in_dev || !in_dev->ifa_list) {
683 rcu_read_unlock();
684 printk(KERN_ERR "%s: no IP address for %s, aborting\n",
685 np->name, np->dev_name);
686 goto release;
687 }
688
689 np->local_ip = ntohl(in_dev->ifa_list->ifa_local);
690 rcu_read_unlock();
691 printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
692 np->name, HIPQUAD(np->local_ip));
693 }
694
695 if(np->rx_hook)
696 np->rx_flags = NETPOLL_RX_ENABLED;
697
698 return 0;
699
700 release:
701 ndev->np = NULL;
702 np->dev = NULL;
703 dev_put(ndev);
704 return -1;
705}
706
707void netpoll_cleanup(struct netpoll *np)
708{
709 if (np->dev)
710 np->dev->np = NULL;
711 dev_put(np->dev);
712 np->dev = NULL;
713}
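
/*
 * Editor's sketch, not part of the original file: the setup/send/cleanup
 * lifecycle a hypothetical client (e.g. a console driver) would go through
 * using only the functions exported below.  Names, addresses and the option
 * string are illustrative assumptions.
 */
static char example_opt[] = "6665@10.0.0.2/eth0,6666@10.0.0.1/00:11:22:33:44:55";
static struct netpoll example_np = { .name = "example" };

static int example_client_init(void)
{
	if (netpoll_parse_options(&example_np, example_opt) < 0)
		return -EINVAL;
	if (netpoll_setup(&example_np))
		return -EINVAL;

	netpoll_send_udp(&example_np, "hello from netpoll\n", 19);
	return 0;
}

static void example_client_exit(void)
{
	netpoll_cleanup(&example_np);
}

/* A real client would wire these up with module_init()/module_exit(). */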
714
715int netpoll_trap(void)
716{
717 return atomic_read(&trapped);
718}
719
720void netpoll_set_trap(int trap)
721{
722 if (trap)
723 atomic_inc(&trapped);
724 else
725 atomic_dec(&trapped);
726}
727
728EXPORT_SYMBOL(netpoll_set_trap);
729EXPORT_SYMBOL(netpoll_trap);
730EXPORT_SYMBOL(netpoll_parse_options);
731EXPORT_SYMBOL(netpoll_setup);
732EXPORT_SYMBOL(netpoll_cleanup);
733EXPORT_SYMBOL(netpoll_send_udp);
734EXPORT_SYMBOL(netpoll_poll);
735EXPORT_SYMBOL(netpoll_queue);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
new file mode 100644
index 000000000000..c57b06bc79f3
--- /dev/null
+++ b/net/core/pktgen.c
@@ -0,0 +1,3132 @@
1/*
2 * Authors:
3 * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se>
4 * Uppsala University and
5 * Swedish University of Agricultural Sciences
6 *
7 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
8 * Ben Greear <greearb@candelatech.com>
9 * Jens Låås <jens.laas@data.slu.se>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 *
17 * A tool for loading the network with preconfigured packets.
18 * The tool is implemented as a Linux module. Parameters are the output
19 * device, delay (to hard_xmit), number of packets, and whether
20 * to use multiple SKBs or just the same one.
21 * pktgen uses the installed interface's output routine.
22 *
23 * Additional hacking by:
24 *
25 * Jens.Laas@data.slu.se
26 * Improved by ANK. 010120.
27 * Improved by ANK even more. 010212.
28 * MAC address typo fixed. 010417 --ro
29 * Integrated. 020301 --DaveM
30 * Added multiskb option 020301 --DaveM
31 * Scaling of results. 020417--sigurdur@linpro.no
32 * Significant re-work of the module:
33 * * Convert to threaded model to more efficiently be able to transmit
34 * and receive on multiple interfaces at once.
35 * * Converted many counters to __u64 to allow longer runs.
36 * * Allow configuration of ranges, like min/max IP address, MACs,
37 * and UDP-ports, for both source and destination, and can
38 * set to use a random distribution or sequentially walk the range.
39 * * Can now change most values after starting.
40 * * Place 12-byte packet in UDP payload with magic number,
41 * sequence number, and timestamp.
42 * * Add receiver code that detects dropped pkts, re-ordered pkts, and
43 *     latencies (with micro-second precision).
44 * * Add IOCTL interface to easily get counters & configuration.
45 * --Ben Greear <greearb@candelatech.com>
46 *
47 * Renamed multiskb to clone_skb and cleaned up the sending core for two
48 * distinct skb modes: a clone_skb=0 mode for Ben's "ranges" work and a
49 * clone_skb != 0 mode as a "fastpath" with a configurable number of clones
50 * after each alloc. clone_skb=0 means all packets are allocated, so ranges,
51 * time stamps etc. can be used. clone_skb=100 means one alloc is followed by
52 * 100 clones.
53 *
54 * Also moved to /proc/net/pktgen/
55 * --ro
56 *
57 * Sept 10: Fixed threading/locking. Lots of bone-headed and more clever
58 * mistakes. Also merged in DaveM's patch in the -pre6 patch.
59 * --Ben Greear <greearb@candelatech.com>
60 *
61 * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
62 *
63 *
64 * 021124 Finished major redesign and rewrite for new functionality.
65 * See Documentation/networking/pktgen.txt for how to use this.
66 *
67 * The new operation:
68 * For each CPU one thread/process is created at start. This thread checks
69 * for running devices in its if_list and sends packets until count is 0.
70 * The thread also checks thread->control, which is used for inter-process
71 * communication: the controlling process "posts" operations to the threads
72 * this way. It should be possible to remove the if_lock once add/rem_device
73 * is merged into this too.
74 *
75 * By design there should only be *one* "controlling" process. In practice
76 * multiple write accesses give unpredictable results. Each "write" to /proc
77 * returns a result code that should be read back by the "writer".
78 * For practical use this should be no problem.
79 *
80 * Note: when adding devices to a specific CPU it is a good idea to also assign
81 * /proc/irq/XX/smp_affinity so TX interrupts get bound to the same CPU.
82 * --ro
83 *
84 * Fix refcount off by one if first packet fails, potential null deref,
85 * memleak 030710- KJP
86 *
87 * First "ranges" functionality for ipv6 030726 --ro
88 *
89 * Included flow support. 030802 ANK.
90 *
91 * Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org>
92 *
93 * Remove if fix from added Harald Welte <laforge@netfilter.org> 040419
94 * ia64 compilation fix from Aron Griffis <aron@hp.com> 040604
95 *
96 * New xmit() return, do_div and misc clean up by Stephen Hemminger
97 * <shemminger@osdl.org> 040923
98 *
99 * Randy Dunlap fixed u64 printk compiler warning
100 *
101 * Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org>
102 * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213
103 *
104 * Corrections from Nikolai Malykh (nmalykh@bilim.com)
105 * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230
106 *
107 * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>
108 * 050103
109 */
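
/*
 * Editor's note, not part of the original file -- an illustrative session
 * only; see Documentation/networking/pktgen.txt (referenced above) for the
 * authoritative usage.  Assuming the control file created by this module is
 * /proc/net/pktgen/pgctrl (handled by proc_pgctrl_write() below) and that a
 * device already added to a pktgen thread shows up as /proc/net/pktgen/eth0
 * (handled by proc_if_write()), a minimal run might look like:
 *
 *     echo "count 100000"  > /proc/net/pktgen/eth0
 *     echo "pkt_size 60"   > /proc/net/pktgen/eth0
 *     echo "clone_skb 100" > /proc/net/pktgen/eth0
 *     echo "delay 0"       > /proc/net/pktgen/eth0
 *     echo "start"         > /proc/net/pktgen/pgctrl
 *     ...
 *     echo "stop"          > /proc/net/pktgen/pgctrl
 *     cat /proc/net/pktgen/eth0     # parameters, counters and "Result: ..."
 *
 * Every accepted command writes an "OK: ..." string into the device's result
 * buffer, which a subsequent read of the same file returns.
 */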
110#include <linux/sys.h>
111#include <linux/types.h>
112#include <linux/module.h>
113#include <linux/moduleparam.h>
114#include <linux/kernel.h>
115#include <linux/smp_lock.h>
116#include <linux/sched.h>
117#include <linux/slab.h>
118#include <linux/vmalloc.h>
119#include <linux/sched.h>
120#include <linux/unistd.h>
121#include <linux/string.h>
122#include <linux/ptrace.h>
123#include <linux/errno.h>
124#include <linux/ioport.h>
125#include <linux/interrupt.h>
126#include <linux/delay.h>
127#include <linux/timer.h>
128#include <linux/init.h>
129#include <linux/skbuff.h>
130#include <linux/netdevice.h>
131#include <linux/inet.h>
132#include <linux/inetdevice.h>
133#include <linux/rtnetlink.h>
134#include <linux/if_arp.h>
135#include <linux/in.h>
136#include <linux/ip.h>
137#include <linux/ipv6.h>
138#include <linux/udp.h>
139#include <linux/proc_fs.h>
140#include <linux/wait.h>
141#include <net/checksum.h>
142#include <net/ipv6.h>
143#include <net/addrconf.h>
144#include <asm/byteorder.h>
145#include <linux/rcupdate.h>
146#include <asm/bitops.h>
147#include <asm/io.h>
148#include <asm/dma.h>
149#include <asm/uaccess.h>
150#include <asm/div64.h> /* do_div */
151#include <asm/timex.h>
152
153
154#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n"
155
156/* #define PG_DEBUG(a) a */
157#define PG_DEBUG(a)
158
159/* The buckets are exponential in 'width' */
160#define LAT_BUCKETS_MAX 32
161#define IP_NAME_SZ 32
162
163/* Device flag bits */
164#define F_IPSRC_RND (1<<0) /* IP-Src Random */
165#define F_IPDST_RND (1<<1) /* IP-Dst Random */
166#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */
167#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */
168#define F_MACSRC_RND (1<<4) /* MAC-Src Random */
169#define F_MACDST_RND (1<<5) /* MAC-Dst Random */
170#define F_TXSIZE_RND (1<<6) /* Transmit size is random */
171#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */
172
173/* Thread control flag bits */
174#define T_TERMINATE (1<<0)
175#define T_STOP (1<<1) /* Stop run */
176#define T_RUN (1<<2) /* Start run */
177#define T_REMDEV (1<<3) /* Remove all devs */
178
179/* Locks */
180#define thread_lock() spin_lock(&_thread_lock)
181#define thread_unlock() spin_unlock(&_thread_lock)
182
183/* If lock -- can be removed after some work */
184#define if_lock(t) spin_lock(&(t->if_lock));
185#define if_unlock(t) spin_unlock(&(t->if_lock));
186
187/* Used to help with determining the pkts on receive */
188#define PKTGEN_MAGIC 0xbe9be955
189#define PG_PROC_DIR "pktgen"
190
191#define MAX_CFLOWS 65536
192
193struct flow_state
194{
195 __u32 cur_daddr;
196 int count;
197};
198
199struct pktgen_dev {
200
201 /*
202	 * Try to keep frequently and infrequently used vars separated.
203 */
204
205 char ifname[32];
206 struct proc_dir_entry *proc_ent;
207 char result[512];
208 /* proc file names */
209 char fname[80];
210
211 struct pktgen_thread* pg_thread; /* the owner */
212 struct pktgen_dev *next; /* Used for chaining in the thread's run-queue */
213
214 int running; /* if this changes to false, the test will stop */
215
216 /* If min != max, then we will either do a linear iteration, or
217 * we will do a random selection from within the range.
218 */
219 __u32 flags;
220
221 int min_pkt_size; /* = ETH_ZLEN; */
222 int max_pkt_size; /* = ETH_ZLEN; */
223 int nfrags;
224 __u32 delay_us; /* Default delay */
225 __u32 delay_ns;
226 __u64 count; /* Default No packets to send */
227 __u64 sofar; /* How many pkts we've sent so far */
228 __u64 tx_bytes; /* How many bytes we've transmitted */
229 __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */
230
231 /* runtime counters relating to clone_skb */
232 __u64 next_tx_us; /* timestamp of when to tx next */
233 __u32 next_tx_ns;
234
235 __u64 allocated_skbs;
236 __u32 clone_count;
237 int last_ok; /* Was last skb sent?
238 * Or a failed transmit of some sort? This will keep
239 * sequence numbers in order, for example.
240 */
241 __u64 started_at; /* micro-seconds */
242 __u64 stopped_at; /* micro-seconds */
243 __u64 idle_acc; /* micro-seconds */
244 __u32 seq_num;
245
246 int clone_skb; /* Use multiple SKBs during packet gen. If this number
247 * is greater than 1, then that many coppies of the same
248 * packet will be sent before a new packet is allocated.
249 * For instance, if you want to send 1024 identical packets
250 * before creating a new packet, set clone_skb to 1024.
251 */
252
253 char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
254 char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
255 char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
256 char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
257
258 struct in6_addr in6_saddr;
259 struct in6_addr in6_daddr;
260 struct in6_addr cur_in6_daddr;
261 struct in6_addr cur_in6_saddr;
262 /* For ranges */
263 struct in6_addr min_in6_daddr;
264 struct in6_addr max_in6_daddr;
265 struct in6_addr min_in6_saddr;
266 struct in6_addr max_in6_saddr;
267
268 /* If we're doing ranges, random or incremental, then this
269 * defines the min/max for those ranges.
270 */
271 __u32 saddr_min; /* inclusive, source IP address */
272 __u32 saddr_max; /* exclusive, source IP address */
273 __u32 daddr_min; /* inclusive, dest IP address */
274 __u32 daddr_max; /* exclusive, dest IP address */
275
276 __u16 udp_src_min; /* inclusive, source UDP port */
277 __u16 udp_src_max; /* exclusive, source UDP port */
278 __u16 udp_dst_min; /* inclusive, dest UDP port */
279 __u16 udp_dst_max; /* exclusive, dest UDP port */
280
281 __u32 src_mac_count; /* How many MACs to iterate through */
282 __u32 dst_mac_count; /* How many MACs to iterate through */
283
284 unsigned char dst_mac[6];
285 unsigned char src_mac[6];
286
287 __u32 cur_dst_mac_offset;
288 __u32 cur_src_mac_offset;
289 __u32 cur_saddr;
290 __u32 cur_daddr;
291 __u16 cur_udp_dst;
292 __u16 cur_udp_src;
293 __u32 cur_pkt_size;
294
295 __u8 hh[14];
296 /* = {
297 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
298
299 We fill in SRC address later
300 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
301 0x08, 0x00
302 };
303 */
304 __u16 pad; /* pad out the hh struct to an even 16 bytes */
305
306 struct sk_buff* skb; /* skb we are to transmit next, mainly used for when we
307 * are transmitting the same one multiple times
308 */
309 struct net_device* odev; /* The out-going device. Note that the device should
310                                   * have its pg_info pointer pointing back to this
311 * device. This will be set when the user specifies
312 * the out-going device name (not when the inject is
313 * started as it used to do.)
314 */
315 struct flow_state *flows;
316 unsigned cflows; /* Concurrent flows (config) */
317 unsigned lflow; /* Flow length (config) */
318 unsigned nflows; /* accumulated flows (stats) */
319};
320
321struct pktgen_hdr {
322 __u32 pgh_magic;
323 __u32 seq_num;
324 __u32 tv_sec;
325 __u32 tv_usec;
326};
327
328struct pktgen_thread {
329 spinlock_t if_lock;
330        struct pktgen_dev *if_list;          /* All devices here */
331 struct pktgen_thread* next;
332 char name[32];
333 char fname[128]; /* name of proc file */
334 struct proc_dir_entry *proc_ent;
335 char result[512];
336 u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */
337
338 /* Field for thread to receive "posted" events terminate, stop ifs etc.*/
339
340 u32 control;
341 int pid;
342 int cpu;
343
344 wait_queue_head_t queue;
345};
346
347#define REMOVE 1
348#define FIND 0
349
350/* This code works around the fact that do_div cannot handle two 64-bit
351   numbers, and regular 64-by-64-bit division is not available on 32-bit x86 kernels.
352 --Ben
353*/
354
355#define PG_DIV 0
356
357/* This was emailed to LKML by: Chris Caputo <ccaputo@alt.net>
358 * Function copied/adapted/optimized from:
359 *
360 * nemesis.sourceforge.net/browse/lib/static/intmath/ix86/intmath.c.html
361 *
362 * Copyright 1994, University of Cambridge Computer Laboratory
363 * All Rights Reserved.
364 *
365 */
366inline static s64 divremdi3(s64 x, s64 y, int type)
367{
368 u64 a = (x < 0) ? -x : x;
369 u64 b = (y < 0) ? -y : y;
370 u64 res = 0, d = 1;
371
372 if (b > 0) {
373 while (b < a) {
374 b <<= 1;
375 d <<= 1;
376 }
377 }
378
379 do {
380 if ( a >= b ) {
381 a -= b;
382 res += d;
383 }
384 b >>= 1;
385 d >>= 1;
386 }
387 while (d);
388
389 if (PG_DIV == type) {
390 return (((x ^ y) & (1ll<<63)) == 0) ? res : -(s64)res;
391 }
392 else {
393 return ((x & (1ll<<63)) == 0) ? a : -(s64)a;
394 }
395}
396
397/* End of hacks to deal with 64-bit math on x86 */
398
399/** Convert to milliseconds */
400static inline __u64 tv_to_ms(const struct timeval* tv)
401{
402 __u64 ms = tv->tv_usec / 1000;
403 ms += (__u64)tv->tv_sec * (__u64)1000;
404 return ms;
405}
406
407
408/** Convert to micro-seconds */
409static inline __u64 tv_to_us(const struct timeval* tv)
410{
411 __u64 us = tv->tv_usec;
412 us += (__u64)tv->tv_sec * (__u64)1000000;
413 return us;
414}
415
416static inline __u64 pg_div(__u64 n, __u32 base) {
417 __u64 tmp = n;
418 do_div(tmp, base);
419 /* printk("pktgen: pg_div, n: %llu base: %d rv: %llu\n",
420 n, base, tmp); */
421 return tmp;
422}
423
424static inline __u64 pg_div64(__u64 n, __u64 base)
425{
426 __u64 tmp = n;
427/*
428 * How do we know if the architecture we are running on
429 * supports division with a 64-bit base?
430 *
431 */
432#if defined(__sparc_v9__) || defined(__powerpc64__) || defined(__alpha__) || defined(__x86_64__) || defined(__ia64__)
433
434 do_div(tmp, base);
435#else
436 tmp = divremdi3(n, base, PG_DIV);
437#endif
438 return tmp;
439}
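
/*
 * Editor's note, not part of the original file: do_div() divides its 64-bit
 * first argument in place by a 32-bit base and evaluates to the remainder,
 * e.g. (illustrative values)
 *
 *     __u64 n = 1000001;
 *     __u32 rem = do_div(n, 1000);    (afterwards n == 1000, rem == 1)
 *
 * which is why pg_div() and pg_div64() above work on a copy of their
 * argument, and why divremdi3() is needed when the divisor itself is a full
 * 64-bit value on 32-bit machines.
 */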
440
441static inline u32 pktgen_random(void)
442{
443#if 0
444 __u32 n;
445 get_random_bytes(&n, 4);
446 return n;
447#else
448 return net_random();
449#endif
450}
451
452static inline __u64 getCurMs(void)
453{
454 struct timeval tv;
455 do_gettimeofday(&tv);
456 return tv_to_ms(&tv);
457}
458
459static inline __u64 getCurUs(void)
460{
461 struct timeval tv;
462 do_gettimeofday(&tv);
463 return tv_to_us(&tv);
464}
465
466static inline __u64 tv_diff(const struct timeval* a, const struct timeval* b)
467{
468 return tv_to_us(a) - tv_to_us(b);
469}
470
471
472/* old include end */
473
474static char version[] __initdata = VERSION;
475
476static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, size_t count, loff_t *ppos);
477static ssize_t proc_pgctrl_write(struct file* file, const char __user * buf, size_t count, loff_t *ppos);
478static int proc_if_read(char *buf , char **start, off_t offset, int len, int *eof, void *data);
479
480static int proc_thread_read(char *buf , char **start, off_t offset, int len, int *eof, void *data);
481static int proc_if_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data);
482static int proc_thread_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data);
483static int create_proc_dir(void);
484static int remove_proc_dir(void);
485
486static int pktgen_remove_device(struct pktgen_thread* t, struct pktgen_dev *i);
487static int pktgen_add_device(struct pktgen_thread* t, const char* ifname);
488static struct pktgen_thread* pktgen_find_thread(const char* name);
489static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread* t, const char* ifname);
490static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
491static void pktgen_run_all_threads(void);
492static void pktgen_stop_all_threads_ifs(void);
493static int pktgen_stop_device(struct pktgen_dev *pkt_dev);
494static void pktgen_stop(struct pktgen_thread* t);
495static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
496static struct pktgen_dev *pktgen_NN_threads(const char* dev_name, int remove);
497static unsigned int scan_ip6(const char *s,char ip[16]);
498static unsigned int fmt_ip6(char *s,const char ip[16]);
499
500/* Module parameters, defaults. */
501static int pg_count_d = 1000; /* 1000 pkts by default */
502static int pg_delay_d = 0;
503static int pg_clone_skb_d = 0;
504static int debug = 0;
505
506static spinlock_t _thread_lock = SPIN_LOCK_UNLOCKED;
507static struct pktgen_thread *pktgen_threads = NULL;
508
509static char module_fname[128];
510static struct proc_dir_entry *module_proc_ent = NULL;
511
512static struct notifier_block pktgen_notifier_block = {
513 .notifier_call = pktgen_device_event,
514};
515
516static struct file_operations pktgen_fops = {
517 .read = proc_pgctrl_read,
518 .write = proc_pgctrl_write,
519 /* .ioctl = pktgen_ioctl, later maybe */
520};
521
522/*
523 * /proc handling functions
524 *
525 */
526
527static struct proc_dir_entry *pg_proc_dir = NULL;
528static int proc_pgctrl_read_eof=0;
529
530static ssize_t proc_pgctrl_read(struct file* file, char __user * buf,
531 size_t count, loff_t *ppos)
532{
533 char data[200];
534 int len = 0;
535
536 if(proc_pgctrl_read_eof) {
537 proc_pgctrl_read_eof=0;
538 len = 0;
539 goto out;
540 }
541
542 sprintf(data, "%s", VERSION);
543
544 len = strlen(data);
545
546 if(len > count) {
547 len =-EFAULT;
548 goto out;
549 }
550
551 if (copy_to_user(buf, data, len)) {
552 len =-EFAULT;
553 goto out;
554 }
555
556 *ppos += len;
557 proc_pgctrl_read_eof=1; /* EOF next call */
558
559 out:
560 return len;
561}
562
563static ssize_t proc_pgctrl_write(struct file* file,const char __user * buf,
564 size_t count, loff_t *ppos)
565{
566 char *data = NULL;
567 int err = 0;
568
569 if (!capable(CAP_NET_ADMIN)){
570 err = -EPERM;
571 goto out;
572 }
573
574 data = (void*)vmalloc ((unsigned int)count);
575
576 if(!data) {
577 err = -ENOMEM;
578 goto out;
579 }
580 if (copy_from_user(data, buf, count)) {
581 err =-EFAULT;
582 goto out_free;
583 }
584 data[count-1] = 0; /* Make string */
585
586 if (!strcmp(data, "stop"))
587 pktgen_stop_all_threads_ifs();
588
589 else if (!strcmp(data, "start"))
590 pktgen_run_all_threads();
591
592 else
593 printk("pktgen: Unknown command: %s\n", data);
594
595 err = count;
596
597 out_free:
598 vfree (data);
599 out:
600 return err;
601}
602
603static int proc_if_read(char *buf , char **start, off_t offset,
604 int len, int *eof, void *data)
605{
606 char *p;
607 int i;
608 struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data);
609 __u64 sa;
610 __u64 stopped;
611 __u64 now = getCurUs();
612
613 p = buf;
614 p += sprintf(p, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n",
615 (unsigned long long) pkt_dev->count,
616 pkt_dev->min_pkt_size, pkt_dev->max_pkt_size);
617
618 p += sprintf(p, " frags: %d delay: %u clone_skb: %d ifname: %s\n",
619 pkt_dev->nfrags, 1000*pkt_dev->delay_us+pkt_dev->delay_ns, pkt_dev->clone_skb, pkt_dev->ifname);
620
621 p += sprintf(p, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow);
622
623
624 if(pkt_dev->flags & F_IPV6) {
625 char b1[128], b2[128], b3[128];
626 fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr);
627 fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr);
628 fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr);
629 p += sprintf(p, " saddr: %s min_saddr: %s max_saddr: %s\n", b1, b2, b3);
630
631 fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr);
632 fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr);
633 fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr);
634 p += sprintf(p, " daddr: %s min_daddr: %s max_daddr: %s\n", b1, b2, b3);
635
636 }
637 else
638 p += sprintf(p, " dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n",
639 pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, pkt_dev->src_max);
640
641 p += sprintf(p, " src_mac: ");
642
643 if ((pkt_dev->src_mac[0] == 0) &&
644 (pkt_dev->src_mac[1] == 0) &&
645 (pkt_dev->src_mac[2] == 0) &&
646 (pkt_dev->src_mac[3] == 0) &&
647 (pkt_dev->src_mac[4] == 0) &&
648 (pkt_dev->src_mac[5] == 0))
649
650 for (i = 0; i < 6; i++)
651 p += sprintf(p, "%02X%s", pkt_dev->odev->dev_addr[i], i == 5 ? " " : ":");
652
653 else
654 for (i = 0; i < 6; i++)
655 p += sprintf(p, "%02X%s", pkt_dev->src_mac[i], i == 5 ? " " : ":");
656
657 p += sprintf(p, "dst_mac: ");
658 for (i = 0; i < 6; i++)
659 p += sprintf(p, "%02X%s", pkt_dev->dst_mac[i], i == 5 ? "\n" : ":");
660
661 p += sprintf(p, " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
662 pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min,
663 pkt_dev->udp_dst_max);
664
665 p += sprintf(p, " src_mac_count: %d dst_mac_count: %d \n Flags: ",
666 pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
667
668
669 if (pkt_dev->flags & F_IPV6)
670 p += sprintf(p, "IPV6 ");
671
672 if (pkt_dev->flags & F_IPSRC_RND)
673 p += sprintf(p, "IPSRC_RND ");
674
675 if (pkt_dev->flags & F_IPDST_RND)
676 p += sprintf(p, "IPDST_RND ");
677
678 if (pkt_dev->flags & F_TXSIZE_RND)
679 p += sprintf(p, "TXSIZE_RND ");
680
681 if (pkt_dev->flags & F_UDPSRC_RND)
682 p += sprintf(p, "UDPSRC_RND ");
683
684 if (pkt_dev->flags & F_UDPDST_RND)
685 p += sprintf(p, "UDPDST_RND ");
686
687 if (pkt_dev->flags & F_MACSRC_RND)
688 p += sprintf(p, "MACSRC_RND ");
689
690 if (pkt_dev->flags & F_MACDST_RND)
691 p += sprintf(p, "MACDST_RND ");
692
693
694 p += sprintf(p, "\n");
695
696 sa = pkt_dev->started_at;
697 stopped = pkt_dev->stopped_at;
698 if (pkt_dev->running)
699 stopped = now; /* not really stopped, more like last-running-at */
700
701 p += sprintf(p, "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n",
702 (unsigned long long) pkt_dev->sofar,
703 (unsigned long long) pkt_dev->errors,
704 (unsigned long long) sa,
705 (unsigned long long) stopped,
706 (unsigned long long) pkt_dev->idle_acc);
707
708 p += sprintf(p, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n",
709 pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, pkt_dev->cur_src_mac_offset);
710
711 if(pkt_dev->flags & F_IPV6) {
712 char b1[128], b2[128];
713 fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr);
714 fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr);
715 p += sprintf(p, " cur_saddr: %s cur_daddr: %s\n", b2, b1);
716 }
717 else
718 p += sprintf(p, " cur_saddr: 0x%x cur_daddr: 0x%x\n",
719 pkt_dev->cur_saddr, pkt_dev->cur_daddr);
720
721
722 p += sprintf(p, " cur_udp_dst: %d cur_udp_src: %d\n",
723 pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
724
725 p += sprintf(p, " flows: %u\n", pkt_dev->nflows);
726
727 if (pkt_dev->result[0])
728 p += sprintf(p, "Result: %s\n", pkt_dev->result);
729 else
730 p += sprintf(p, "Result: Idle\n");
731 *eof = 1;
732
733 return p - buf;
734}
735
736
737static int count_trail_chars(const char __user *user_buffer, unsigned int maxlen)
738{
739 int i;
740
741 for (i = 0; i < maxlen; i++) {
742 char c;
743 if (get_user(c, &user_buffer[i]))
744 return -EFAULT;
745 switch (c) {
746 case '\"':
747 case '\n':
748 case '\r':
749 case '\t':
750 case ' ':
751 case '=':
752 break;
753 default:
754 goto done;
755 };
756 }
757done:
758 return i;
759}
760
761static unsigned long num_arg(const char __user *user_buffer, unsigned long maxlen,
762 unsigned long *num)
763{
764 int i = 0;
765 *num = 0;
766
767 for(; i < maxlen; i++) {
768 char c;
769 if (get_user(c, &user_buffer[i]))
770 return -EFAULT;
771 if ((c >= '0') && (c <= '9')) {
772 *num *= 10;
773 *num += c -'0';
774 } else
775 break;
776 }
777 return i;
778}
779
780static int strn_len(const char __user *user_buffer, unsigned int maxlen)
781{
782 int i = 0;
783
784 for(; i < maxlen; i++) {
785 char c;
786 if (get_user(c, &user_buffer[i]))
787 return -EFAULT;
788 switch (c) {
789 case '\"':
790 case '\n':
791 case '\r':
792 case '\t':
793 case ' ':
794 goto done_str;
795 break;
796 default:
797 break;
798 };
799 }
800done_str:
801
802 return i;
803}
804
805static int proc_if_write(struct file *file, const char __user *user_buffer,
806 unsigned long count, void *data)
807{
808 int i = 0, max, len;
809 char name[16], valstr[32];
810 unsigned long value = 0;
811 struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data);
812 char* pg_result = NULL;
813 int tmp = 0;
814 char buf[128];
815
816 pg_result = &(pkt_dev->result[0]);
817
818 if (count < 1) {
819 printk("pktgen: wrong command format\n");
820 return -EINVAL;
821 }
822
823 max = count - i;
824 tmp = count_trail_chars(&user_buffer[i], max);
825 if (tmp < 0) {
826 printk("pktgen: illegal format\n");
827 return tmp;
828 }
829 i += tmp;
830
831 /* Read variable name */
832
833 len = strn_len(&user_buffer[i], sizeof(name) - 1);
834 if (len < 0) { return len; }
835 memset(name, 0, sizeof(name));
836 if (copy_from_user(name, &user_buffer[i], len) )
837 return -EFAULT;
838 i += len;
839
840 max = count -i;
841 len = count_trail_chars(&user_buffer[i], max);
842 if (len < 0)
843 return len;
844
845 i += len;
846
847 if (debug) {
848 char tb[count + 1];
849 if (copy_from_user(tb, user_buffer, count))
850 return -EFAULT;
851 tb[count] = 0;
852 printk("pktgen: %s,%lu buffer -:%s:-\n", name, count, tb);
853 }
854
855 if (!strcmp(name, "min_pkt_size")) {
856 len = num_arg(&user_buffer[i], 10, &value);
857 if (len < 0) { return len; }
858 i += len;
859 if (value < 14+20+8)
860 value = 14+20+8;
861 if (value != pkt_dev->min_pkt_size) {
862 pkt_dev->min_pkt_size = value;
863 pkt_dev->cur_pkt_size = value;
864 }
865 sprintf(pg_result, "OK: min_pkt_size=%u", pkt_dev->min_pkt_size);
866 return count;
867 }
868
869 if (!strcmp(name, "max_pkt_size")) {
870 len = num_arg(&user_buffer[i], 10, &value);
871 if (len < 0) { return len; }
872 i += len;
873 if (value < 14+20+8)
874 value = 14+20+8;
875 if (value != pkt_dev->max_pkt_size) {
876 pkt_dev->max_pkt_size = value;
877 pkt_dev->cur_pkt_size = value;
878 }
879 sprintf(pg_result, "OK: max_pkt_size=%u", pkt_dev->max_pkt_size);
880 return count;
881 }
882
883 /* Shortcut for min = max */
884
885 if (!strcmp(name, "pkt_size")) {
886 len = num_arg(&user_buffer[i], 10, &value);
887 if (len < 0) { return len; }
888 i += len;
889 if (value < 14+20+8)
890 value = 14+20+8;
891 if (value != pkt_dev->min_pkt_size) {
892 pkt_dev->min_pkt_size = value;
893 pkt_dev->max_pkt_size = value;
894 pkt_dev->cur_pkt_size = value;
895 }
896 sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
897 return count;
898 }
899
900 if (!strcmp(name, "debug")) {
901 len = num_arg(&user_buffer[i], 10, &value);
902 if (len < 0) { return len; }
903 i += len;
904 debug = value;
905 sprintf(pg_result, "OK: debug=%u", debug);
906 return count;
907 }
908
909 if (!strcmp(name, "frags")) {
910 len = num_arg(&user_buffer[i], 10, &value);
911 if (len < 0) { return len; }
912 i += len;
913 pkt_dev->nfrags = value;
914 sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
915 return count;
916 }
917 if (!strcmp(name, "delay")) {
918 len = num_arg(&user_buffer[i], 10, &value);
919 if (len < 0) { return len; }
920 i += len;
921 if (value == 0x7FFFFFFF) {
922 pkt_dev->delay_us = 0x7FFFFFFF;
923 pkt_dev->delay_ns = 0;
924 } else {
925 pkt_dev->delay_us = value / 1000;
926 pkt_dev->delay_ns = value % 1000;
927 }
928 sprintf(pg_result, "OK: delay=%u", 1000*pkt_dev->delay_us+pkt_dev->delay_ns);
929 return count;
930 }
931 if (!strcmp(name, "udp_src_min")) {
932 len = num_arg(&user_buffer[i], 10, &value);
933 if (len < 0) { return len; }
934 i += len;
935 if (value != pkt_dev->udp_src_min) {
936 pkt_dev->udp_src_min = value;
937 pkt_dev->cur_udp_src = value;
938 }
939 sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min);
940 return count;
941 }
942 if (!strcmp(name, "udp_dst_min")) {
943 len = num_arg(&user_buffer[i], 10, &value);
944 if (len < 0) { return len; }
945 i += len;
946 if (value != pkt_dev->udp_dst_min) {
947 pkt_dev->udp_dst_min = value;
948 pkt_dev->cur_udp_dst = value;
949 }
950 sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min);
951 return count;
952 }
953 if (!strcmp(name, "udp_src_max")) {
954 len = num_arg(&user_buffer[i], 10, &value);
955 if (len < 0) { return len; }
956 i += len;
957 if (value != pkt_dev->udp_src_max) {
958 pkt_dev->udp_src_max = value;
959 pkt_dev->cur_udp_src = value;
960 }
961 sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max);
962 return count;
963 }
964 if (!strcmp(name, "udp_dst_max")) {
965 len = num_arg(&user_buffer[i], 10, &value);
966 if (len < 0) { return len; }
967 i += len;
968 if (value != pkt_dev->udp_dst_max) {
969 pkt_dev->udp_dst_max = value;
970 pkt_dev->cur_udp_dst = value;
971 }
972 sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max);
973 return count;
974 }
975 if (!strcmp(name, "clone_skb")) {
976 len = num_arg(&user_buffer[i], 10, &value);
977 if (len < 0) { return len; }
978 i += len;
979 pkt_dev->clone_skb = value;
980
981 sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
982 return count;
983 }
984 if (!strcmp(name, "count")) {
985 len = num_arg(&user_buffer[i], 10, &value);
986 if (len < 0) { return len; }
987 i += len;
988 pkt_dev->count = value;
989 sprintf(pg_result, "OK: count=%llu",
990 (unsigned long long) pkt_dev->count);
991 return count;
992 }
993 if (!strcmp(name, "src_mac_count")) {
994 len = num_arg(&user_buffer[i], 10, &value);
995 if (len < 0) { return len; }
996 i += len;
997 if (pkt_dev->src_mac_count != value) {
998 pkt_dev->src_mac_count = value;
999 pkt_dev->cur_src_mac_offset = 0;
1000 }
1001 sprintf(pg_result, "OK: src_mac_count=%d", pkt_dev->src_mac_count);
1002 return count;
1003 }
1004 if (!strcmp(name, "dst_mac_count")) {
1005 len = num_arg(&user_buffer[i], 10, &value);
1006 if (len < 0) { return len; }
1007 i += len;
1008 if (pkt_dev->dst_mac_count != value) {
1009 pkt_dev->dst_mac_count = value;
1010 pkt_dev->cur_dst_mac_offset = 0;
1011 }
1012 sprintf(pg_result, "OK: dst_mac_count=%d", pkt_dev->dst_mac_count);
1013 return count;
1014 }
1015 if (!strcmp(name, "flag")) {
1016 char f[32];
1017 memset(f, 0, 32);
1018 len = strn_len(&user_buffer[i], sizeof(f) - 1);
1019 if (len < 0) { return len; }
1020 if (copy_from_user(f, &user_buffer[i], len))
1021 return -EFAULT;
1022 i += len;
1023 if (strcmp(f, "IPSRC_RND") == 0)
1024 pkt_dev->flags |= F_IPSRC_RND;
1025
1026 else if (strcmp(f, "!IPSRC_RND") == 0)
1027 pkt_dev->flags &= ~F_IPSRC_RND;
1028
1029 else if (strcmp(f, "TXSIZE_RND") == 0)
1030 pkt_dev->flags |= F_TXSIZE_RND;
1031
1032 else if (strcmp(f, "!TXSIZE_RND") == 0)
1033 pkt_dev->flags &= ~F_TXSIZE_RND;
1034
1035 else if (strcmp(f, "IPDST_RND") == 0)
1036 pkt_dev->flags |= F_IPDST_RND;
1037
1038 else if (strcmp(f, "!IPDST_RND") == 0)
1039 pkt_dev->flags &= ~F_IPDST_RND;
1040
1041 else if (strcmp(f, "UDPSRC_RND") == 0)
1042 pkt_dev->flags |= F_UDPSRC_RND;
1043
1044 else if (strcmp(f, "!UDPSRC_RND") == 0)
1045 pkt_dev->flags &= ~F_UDPSRC_RND;
1046
1047 else if (strcmp(f, "UDPDST_RND") == 0)
1048 pkt_dev->flags |= F_UDPDST_RND;
1049
1050 else if (strcmp(f, "!UDPDST_RND") == 0)
1051 pkt_dev->flags &= ~F_UDPDST_RND;
1052
1053 else if (strcmp(f, "MACSRC_RND") == 0)
1054 pkt_dev->flags |= F_MACSRC_RND;
1055
1056 else if (strcmp(f, "!MACSRC_RND") == 0)
1057 pkt_dev->flags &= ~F_MACSRC_RND;
1058
1059 else if (strcmp(f, "MACDST_RND") == 0)
1060 pkt_dev->flags |= F_MACDST_RND;
1061
1062 else if (strcmp(f, "!MACDST_RND") == 0)
1063 pkt_dev->flags &= ~F_MACDST_RND;
1064
1065 else {
1066			sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags (prepend ! to clear a flag):\n%s",
1067 f,
1068 "IPSRC_RND, IPDST_RND, TXSIZE_RND, UDPSRC_RND, UDPDST_RND, MACSRC_RND, MACDST_RND\n");
1069 return count;
1070 }
1071 sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
1072 return count;
1073 }
1074 if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
1075 len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
1076 if (len < 0) { return len; }
1077
1078 if (copy_from_user(buf, &user_buffer[i], len))
1079 return -EFAULT;
1080 buf[len] = 0;
1081 if (strcmp(buf, pkt_dev->dst_min) != 0) {
1082 memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
1083 strncpy(pkt_dev->dst_min, buf, len);
1084 pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
1085 pkt_dev->cur_daddr = pkt_dev->daddr_min;
1086 }
1087 if(debug)
1088 printk("pktgen: dst_min set to: %s\n", pkt_dev->dst_min);
1089 i += len;
1090 sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
1091 return count;
1092 }
1093 if (!strcmp(name, "dst_max")) {
1094 len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
1095 if (len < 0) { return len; }
1096
1097 if (copy_from_user(buf, &user_buffer[i], len))
1098 return -EFAULT;
1099
1100 buf[len] = 0;
1101 if (strcmp(buf, pkt_dev->dst_max) != 0) {
1102 memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
1103 strncpy(pkt_dev->dst_max, buf, len);
1104 pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
1105 pkt_dev->cur_daddr = pkt_dev->daddr_max;
1106 }
1107 if(debug)
1108 printk("pktgen: dst_max set to: %s\n", pkt_dev->dst_max);
1109 i += len;
1110 sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
1111 return count;
1112 }
1113 if (!strcmp(name, "dst6")) {
1114 len = strn_len(&user_buffer[i], sizeof(buf) - 1);
1115 if (len < 0) return len;
1116
1117 pkt_dev->flags |= F_IPV6;
1118
1119 if (copy_from_user(buf, &user_buffer[i], len))
1120 return -EFAULT;
1121 buf[len] = 0;
1122
1123 scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
1124 fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr);
1125
1126 ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr);
1127
1128 if(debug)
1129 printk("pktgen: dst6 set to: %s\n", buf);
1130
1131 i += len;
1132 sprintf(pg_result, "OK: dst6=%s", buf);
1133 return count;
1134 }
1135 if (!strcmp(name, "dst6_min")) {
1136 len = strn_len(&user_buffer[i], sizeof(buf) - 1);
1137 if (len < 0) return len;
1138
1139 pkt_dev->flags |= F_IPV6;
1140
1141 if (copy_from_user(buf, &user_buffer[i], len))
1142 return -EFAULT;
1143 buf[len] = 0;
1144
1145 scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
1146 fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
1147
1148 ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->min_in6_daddr);
1149 if(debug)
1150 printk("pktgen: dst6_min set to: %s\n", buf);
1151
1152 i += len;
1153 sprintf(pg_result, "OK: dst6_min=%s", buf);
1154 return count;
1155 }
1156 if (!strcmp(name, "dst6_max")) {
1157 len = strn_len(&user_buffer[i], sizeof(buf) - 1);
1158 if (len < 0) return len;
1159
1160 pkt_dev->flags |= F_IPV6;
1161
1162 if (copy_from_user(buf, &user_buffer[i], len))
1163 return -EFAULT;
1164 buf[len] = 0;
1165
1166 scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
1167 fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
1168
1169 if(debug)
1170 printk("pktgen: dst6_max set to: %s\n", buf);
1171
1172 i += len;
1173 sprintf(pg_result, "OK: dst6_max=%s", buf);
1174 return count;
1175 }
1176 if (!strcmp(name, "src6")) {
1177 len = strn_len(&user_buffer[i], sizeof(buf) - 1);
1178 if (len < 0) return len;
1179
1180 pkt_dev->flags |= F_IPV6;
1181
1182 if (copy_from_user(buf, &user_buffer[i], len))
1183 return -EFAULT;
1184 buf[len] = 0;
1185
1186 scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
1187 fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr);
1188
1189 ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr);
1190
1191 if(debug)
1192 printk("pktgen: src6 set to: %s\n", buf);
1193
1194 i += len;
1195 sprintf(pg_result, "OK: src6=%s", buf);
1196 return count;
1197 }
1198 if (!strcmp(name, "src_min")) {
1199 len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
1200 if (len < 0) { return len; }
1201 if (copy_from_user(buf, &user_buffer[i], len))
1202 return -EFAULT;
1203 buf[len] = 0;
1204 if (strcmp(buf, pkt_dev->src_min) != 0) {
1205 memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
1206 strncpy(pkt_dev->src_min, buf, len);
1207 pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
1208 pkt_dev->cur_saddr = pkt_dev->saddr_min;
1209 }
1210 if(debug)
1211 printk("pktgen: src_min set to: %s\n", pkt_dev->src_min);
1212 i += len;
1213 sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
1214 return count;
1215 }
1216 if (!strcmp(name, "src_max")) {
1217 len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
1218 if (len < 0) { return len; }
1219 if (copy_from_user(buf, &user_buffer[i], len))
1220 return -EFAULT;
1221 buf[len] = 0;
1222 if (strcmp(buf, pkt_dev->src_max) != 0) {
1223 memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
1224 strncpy(pkt_dev->src_max, buf, len);
1225 pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
1226 pkt_dev->cur_saddr = pkt_dev->saddr_max;
1227 }
1228 if(debug)
1229 printk("pktgen: src_max set to: %s\n", pkt_dev->src_max);
1230 i += len;
1231 sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
1232 return count;
1233 }
1234 if (!strcmp(name, "dst_mac")) {
1235 char *v = valstr;
1236 unsigned char old_dmac[6];
1237 unsigned char *m = pkt_dev->dst_mac;
1238 memcpy(old_dmac, pkt_dev->dst_mac, 6);
1239
1240 len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
1241 if (len < 0) { return len; }
1242 memset(valstr, 0, sizeof(valstr));
1243 if( copy_from_user(valstr, &user_buffer[i], len))
1244 return -EFAULT;
1245 i += len;
1246
1247 for(*m = 0;*v && m < pkt_dev->dst_mac + 6; v++) {
1248 if (*v >= '0' && *v <= '9') {
1249 *m *= 16;
1250 *m += *v - '0';
1251 }
1252 if (*v >= 'A' && *v <= 'F') {
1253 *m *= 16;
1254 *m += *v - 'A' + 10;
1255 }
1256 if (*v >= 'a' && *v <= 'f') {
1257 *m *= 16;
1258 *m += *v - 'a' + 10;
1259 }
1260 if (*v == ':') {
1261 m++;
1262 *m = 0;
1263 }
1264 }
1265
1266 /* Set up Dest MAC */
1267 if (memcmp(old_dmac, pkt_dev->dst_mac, 6) != 0)
1268 memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, 6);
1269
1270 sprintf(pg_result, "OK: dstmac");
1271 return count;
1272 }
1273 if (!strcmp(name, "src_mac")) {
1274 char *v = valstr;
1275 unsigned char *m = pkt_dev->src_mac;
1276
1277 len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
1278 if (len < 0) { return len; }
1279 memset(valstr, 0, sizeof(valstr));
1280 if( copy_from_user(valstr, &user_buffer[i], len))
1281 return -EFAULT;
1282 i += len;
1283
1284 for(*m = 0;*v && m < pkt_dev->src_mac + 6; v++) {
1285 if (*v >= '0' && *v <= '9') {
1286 *m *= 16;
1287 *m += *v - '0';
1288 }
1289 if (*v >= 'A' && *v <= 'F') {
1290 *m *= 16;
1291 *m += *v - 'A' + 10;
1292 }
1293 if (*v >= 'a' && *v <= 'f') {
1294 *m *= 16;
1295 *m += *v - 'a' + 10;
1296 }
1297 if (*v == ':') {
1298 m++;
1299 *m = 0;
1300 }
1301 }
1302
1303 sprintf(pg_result, "OK: srcmac");
1304 return count;
1305 }
1306
1307 if (!strcmp(name, "clear_counters")) {
1308 pktgen_clear_counters(pkt_dev);
1309 sprintf(pg_result, "OK: Clearing counters.\n");
1310 return count;
1311 }
1312
1313 if (!strcmp(name, "flows")) {
1314 len = num_arg(&user_buffer[i], 10, &value);
1315 if (len < 0) { return len; }
1316 i += len;
1317 if (value > MAX_CFLOWS)
1318 value = MAX_CFLOWS;
1319
1320 pkt_dev->cflows = value;
1321 sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);
1322 return count;
1323 }
1324
1325 if (!strcmp(name, "flowlen")) {
1326 len = num_arg(&user_buffer[i], 10, &value);
1327 if (len < 0) { return len; }
1328 i += len;
1329 pkt_dev->lflow = value;
1330 sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
1331 return count;
1332 }
1333
1334 sprintf(pkt_dev->result, "No such parameter \"%s\"", name);
1335 return -EINVAL;
1336}
1337
1338static int proc_thread_read(char *buf , char **start, off_t offset,
1339 int len, int *eof, void *data)
1340{
1341 char *p;
1342 struct pktgen_thread *t = (struct pktgen_thread*)(data);
1343 struct pktgen_dev *pkt_dev = NULL;
1344
1345
1346 if (!t) {
1347 printk("pktgen: ERROR: could not find thread in proc_thread_read\n");
1348 return -EINVAL;
1349 }
1350
1351 p = buf;
1352 p += sprintf(p, "Name: %s max_before_softirq: %d\n",
1353 t->name, t->max_before_softirq);
1354
1355 p += sprintf(p, "Running: ");
1356
1357 if_lock(t);
1358 for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next)
1359 if(pkt_dev->running)
1360 p += sprintf(p, "%s ", pkt_dev->ifname);
1361
1362 p += sprintf(p, "\nStopped: ");
1363
1364 for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next)
1365 if(!pkt_dev->running)
1366 p += sprintf(p, "%s ", pkt_dev->ifname);
1367
1368 if (t->result[0])
1369 p += sprintf(p, "\nResult: %s\n", t->result);
1370 else
1371 p += sprintf(p, "\nResult: NA\n");
1372
1373 *eof = 1;
1374
1375 if_unlock(t);
1376
1377 return p - buf;
1378}
1379
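/* proc_thread_write() handles the per-thread commands implemented below:
 * add_device, rem_device_all and max_before_softirq.  Illustrative shell
 * usage, again assuming the default "pktgen" proc directory and the
 * kpktgend_0 thread created for CPU 0:
 *
 *   echo "add_device eth0"          > /proc/net/pktgen/kpktgend_0
 *   echo "max_before_softirq 10000" > /proc/net/pktgen/kpktgend_0
 *   echo "rem_device_all"           > /proc/net/pktgen/kpktgend_0
 */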
1380static int proc_thread_write(struct file *file, const char __user *user_buffer,
1381 unsigned long count, void *data)
1382{
1383 int i = 0, max, len, ret;
1384 char name[40];
1385 struct pktgen_thread *t;
1386 char *pg_result;
1387 unsigned long value = 0;
1388
1389 if (count < 1) {
1390 // sprintf(pg_result, "Wrong command format");
1391 return -EINVAL;
1392 }
1393
1394 max = count - i;
1395 len = count_trail_chars(&user_buffer[i], max);
1396 if (len < 0)
1397 return len;
1398
1399 i += len;
1400
1401 /* Read variable name */
1402
1403 len = strn_len(&user_buffer[i], sizeof(name) - 1);
1404 if (len < 0)
1405 return len;
1406
1407 memset(name, 0, sizeof(name));
1408 if (copy_from_user(name, &user_buffer[i], len))
1409 return -EFAULT;
1410 i += len;
1411
1412 max = count -i;
1413 len = count_trail_chars(&user_buffer[i], max);
1414 if (len < 0)
1415 return len;
1416
1417 i += len;
1418
1419 if (debug)
1420 printk("pktgen: t=%s, count=%lu\n", name, count);
1421
1422
1423 t = (struct pktgen_thread*)(data);
1424 if(!t) {
1425 printk("pktgen: ERROR: No thread\n");
1426 ret = -EINVAL;
1427 goto out;
1428 }
1429
1430 pg_result = &(t->result[0]);
1431
1432 if (!strcmp(name, "add_device")) {
1433 char f[32];
1434 memset(f, 0, 32);
1435 len = strn_len(&user_buffer[i], sizeof(f) - 1);
1436 if (len < 0) {
1437 ret = len;
1438 goto out;
1439 }
1440 if( copy_from_user(f, &user_buffer[i], len) )
1441 return -EFAULT;
1442 i += len;
1443 thread_lock();
1444 pktgen_add_device(t, f);
1445 thread_unlock();
1446 ret = count;
1447 sprintf(pg_result, "OK: add_device=%s", f);
1448 goto out;
1449 }
1450
1451 if (!strcmp(name, "rem_device_all")) {
1452 thread_lock();
1453 t->control |= T_REMDEV;
1454 thread_unlock();
1455 current->state = TASK_INTERRUPTIBLE;
1456 schedule_timeout(HZ/8); /* Propagate thread->control */
1457 ret = count;
1458 sprintf(pg_result, "OK: rem_device_all");
1459 goto out;
1460 }
1461
1462 if (!strcmp(name, "max_before_softirq")) {
1463 len = num_arg(&user_buffer[i], 10, &value);
1464 thread_lock();
1465 t->max_before_softirq = value;
1466 thread_unlock();
1467 ret = count;
1468 sprintf(pg_result, "OK: max_before_softirq=%lu", value);
1469 goto out;
1470 }
1471
1472 ret = -EINVAL;
1473 out:
1474
1475 return ret;
1476}
1477
1478static int create_proc_dir(void)
1479{
1480 int len;
1481	/* does proc_dir already exist? */
1482 len = strlen(PG_PROC_DIR);
1483
1484 for (pg_proc_dir = proc_net->subdir; pg_proc_dir; pg_proc_dir=pg_proc_dir->next) {
1485 if ((pg_proc_dir->namelen == len) &&
1486 (! memcmp(pg_proc_dir->name, PG_PROC_DIR, len)))
1487 break;
1488 }
1489
1490 if (!pg_proc_dir)
1491 pg_proc_dir = create_proc_entry(PG_PROC_DIR, S_IFDIR, proc_net);
1492
1493 if (!pg_proc_dir)
1494 return -ENODEV;
1495
1496 return 0;
1497}
1498
1499static int remove_proc_dir(void)
1500{
1501 remove_proc_entry(PG_PROC_DIR, proc_net);
1502 return 0;
1503}
1504
1505/* Look up a device by interface name across all threads; remove it too when "remove" is set. */
1506static struct pktgen_dev *__pktgen_NN_threads(const char* ifname, int remove)
1507{
1508 struct pktgen_thread *t;
1509 struct pktgen_dev *pkt_dev = NULL;
1510
1511 t = pktgen_threads;
1512
1513 while (t) {
1514 pkt_dev = pktgen_find_dev(t, ifname);
1515 if (pkt_dev) {
1516 if(remove) {
1517 if_lock(t);
1518 pktgen_remove_device(t, pkt_dev);
1519 if_unlock(t);
1520 }
1521 break;
1522 }
1523 t = t->next;
1524 }
1525 return pkt_dev;
1526}
1527
1528static struct pktgen_dev *pktgen_NN_threads(const char* ifname, int remove)
1529{
1530 struct pktgen_dev *pkt_dev = NULL;
1531 thread_lock();
1532 pkt_dev = __pktgen_NN_threads(ifname, remove);
1533 thread_unlock();
1534 return pkt_dev;
1535}
1536
1537static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
1538{
1539 struct net_device *dev = (struct net_device *)(ptr);
1540
1541 /* It is OK that we do not hold the group lock right now,
1542 * as we run under the RTNL lock.
1543 */
1544
1545 switch (event) {
1546 case NETDEV_CHANGEADDR:
1547 case NETDEV_GOING_DOWN:
1548 case NETDEV_DOWN:
1549 case NETDEV_UP:
1550 /* Ignore for now */
1551 break;
1552
1553 case NETDEV_UNREGISTER:
1554 pktgen_NN_threads(dev->name, REMOVE);
1555 break;
1556 };
1557
1558 return NOTIFY_DONE;
1559}
1560
1561/* Associate pktgen_dev with a device. */
1562
1563static struct net_device* pktgen_setup_dev(struct pktgen_dev *pkt_dev) {
1564 struct net_device *odev;
1565
1566 /* Clean old setups */
1567
1568 if (pkt_dev->odev) {
1569 dev_put(pkt_dev->odev);
1570 pkt_dev->odev = NULL;
1571 }
1572
1573 odev = dev_get_by_name(pkt_dev->ifname);
1574
1575 if (!odev) {
1576 printk("pktgen: no such netdevice: \"%s\"\n", pkt_dev->ifname);
1577 goto out;
1578 }
1579 if (odev->type != ARPHRD_ETHER) {
1580 printk("pktgen: not an ethernet device: \"%s\"\n", pkt_dev->ifname);
1581 goto out_put;
1582 }
1583 if (!netif_running(odev)) {
1584 printk("pktgen: device is down: \"%s\"\n", pkt_dev->ifname);
1585 goto out_put;
1586 }
1587 pkt_dev->odev = odev;
1588
1589 return pkt_dev->odev;
1590
1591out_put:
1592 dev_put(odev);
1593out:
1594 return NULL;
1595
1596}
1597
1598/* Read pkt_dev from the interface and set up internal pktgen_dev
1599 * structure to have the right information to create/send packets
1600 */
1601static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
1602{
1603 /* Try once more, just in case it works now. */
1604 if (!pkt_dev->odev)
1605 pktgen_setup_dev(pkt_dev);
1606
1607 if (!pkt_dev->odev) {
1608 printk("pktgen: ERROR: pkt_dev->odev == NULL in setup_inject.\n");
1609 sprintf(pkt_dev->result, "ERROR: pkt_dev->odev == NULL in setup_inject.\n");
1610 return;
1611 }
1612
1613 /* Default to the interface's mac if not explicitly set. */
1614
1615 if ((pkt_dev->src_mac[0] == 0) &&
1616 (pkt_dev->src_mac[1] == 0) &&
1617 (pkt_dev->src_mac[2] == 0) &&
1618 (pkt_dev->src_mac[3] == 0) &&
1619 (pkt_dev->src_mac[4] == 0) &&
1620 (pkt_dev->src_mac[5] == 0)) {
1621
1622 memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, 6);
1623 }
1624 /* Set up Dest MAC */
1625 memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, 6);
1626
1627 /* Set up pkt size */
1628 pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
1629
1630 if(pkt_dev->flags & F_IPV6) {
1631 /*
1632		 * Skip this automatic address setting until the needed locks and
1633		 * functions get exported.
1634 */
1635
1636#ifdef NOTNOW
1637 int i, set = 0, err=1;
1638 struct inet6_dev *idev;
1639
1640 for(i=0; i< IN6_ADDR_HSIZE; i++)
1641 if(pkt_dev->cur_in6_saddr.s6_addr[i]) {
1642 set = 1;
1643 break;
1644 }
1645
1646 if(!set) {
1647
1648 /*
1649			 * Use the link-level address if unconfigured.
1650			 *
1651			 * Use ipv6_get_lladdr() if/when it gets exported.
1652 */
1653
1654
1655 read_lock(&addrconf_lock);
1656 if ((idev = __in6_dev_get(pkt_dev->odev)) != NULL) {
1657 struct inet6_ifaddr *ifp;
1658
1659 read_lock_bh(&idev->lock);
1660 for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
1661 if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) {
1662 ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &ifp->addr);
1663 err = 0;
1664 break;
1665 }
1666 }
1667 read_unlock_bh(&idev->lock);
1668 }
1669 read_unlock(&addrconf_lock);
1670		if(err) printk("pktgen: ERROR: IPv6 link address not available.\n");
1671 }
1672#endif
1673 }
1674 else {
1675 pkt_dev->saddr_min = 0;
1676 pkt_dev->saddr_max = 0;
1677 if (strlen(pkt_dev->src_min) == 0) {
1678
1679 struct in_device *in_dev;
1680
1681 rcu_read_lock();
1682 in_dev = __in_dev_get(pkt_dev->odev);
1683 if (in_dev) {
1684 if (in_dev->ifa_list) {
1685 pkt_dev->saddr_min = in_dev->ifa_list->ifa_address;
1686 pkt_dev->saddr_max = pkt_dev->saddr_min;
1687 }
1688 __in_dev_put(in_dev);
1689 }
1690 rcu_read_unlock();
1691 }
1692 else {
1693 pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
1694 pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
1695 }
1696
1697 pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
1698 pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
1699 }
1700 /* Initialize current values. */
1701 pkt_dev->cur_dst_mac_offset = 0;
1702 pkt_dev->cur_src_mac_offset = 0;
1703 pkt_dev->cur_saddr = pkt_dev->saddr_min;
1704 pkt_dev->cur_daddr = pkt_dev->daddr_min;
1705 pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
1706 pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
1707 pkt_dev->nflows = 0;
1708}
1709
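/* Wait until spin_until_us: sleep via schedule_timeout() while more than a
 * jiffy remains, run do_softirq() and reschedule while more than ~100us
 * remain, then busy-wait on getCurUs().  The whole wait is charged to
 * pkt_dev->idle_acc.
 */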
1710static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us)
1711{
1712 __u64 start;
1713 __u64 now;
1714
1715 start = now = getCurUs();
1716 printk(KERN_INFO "sleeping for %d\n", (int)(spin_until_us - now));
1717 while (now < spin_until_us) {
1718 /* TODO: optimise sleeping behavior */
1719 if (spin_until_us - now > (1000000/HZ)+1) {
1720 current->state = TASK_INTERRUPTIBLE;
1721 schedule_timeout(1);
1722 } else if (spin_until_us - now > 100) {
1723 do_softirq();
1724 if (!pkt_dev->running)
1725 return;
1726 if (need_resched())
1727 schedule();
1728 }
1729
1730 now = getCurUs();
1731 }
1732
1733 pkt_dev->idle_acc += now - start;
1734}
1735
1736
1737/* Increment/randomize headers according to flags and current values
1738 * for IP src/dest, UDP src/dst port, MAC-Addr src/dst
1739 */
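/* For each field: pick a random value in [min, max) when the corresponding
 * *_RND flag is set, otherwise increment the current value and wrap at the
 * maximum.  With cflows configured, a flow keeps reusing its destination
 * address until roughly lflow packets have been sent on it.
 */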
1740static void mod_cur_headers(struct pktgen_dev *pkt_dev) {
1741 __u32 imn;
1742 __u32 imx;
1743 int flow = 0;
1744
1745 if(pkt_dev->cflows) {
1746 flow = pktgen_random() % pkt_dev->cflows;
1747
1748 if (pkt_dev->flows[flow].count > pkt_dev->lflow)
1749 pkt_dev->flows[flow].count = 0;
1750 }
1751
1752
1753 /* Deal with source MAC */
1754 if (pkt_dev->src_mac_count > 1) {
1755 __u32 mc;
1756 __u32 tmp;
1757
1758 if (pkt_dev->flags & F_MACSRC_RND)
1759 mc = pktgen_random() % (pkt_dev->src_mac_count);
1760 else {
1761 mc = pkt_dev->cur_src_mac_offset++;
1762 if (pkt_dev->cur_src_mac_offset > pkt_dev->src_mac_count)
1763 pkt_dev->cur_src_mac_offset = 0;
1764 }
1765
1766 tmp = pkt_dev->src_mac[5] + (mc & 0xFF);
1767 pkt_dev->hh[11] = tmp;
1768 tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
1769 pkt_dev->hh[10] = tmp;
1770 tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
1771 pkt_dev->hh[9] = tmp;
1772 tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
1773 pkt_dev->hh[8] = tmp;
1774 tmp = (pkt_dev->src_mac[1] + (tmp >> 8));
1775 pkt_dev->hh[7] = tmp;
1776 }
1777
1778 /* Deal with Destination MAC */
1779 if (pkt_dev->dst_mac_count > 1) {
1780 __u32 mc;
1781 __u32 tmp;
1782
1783 if (pkt_dev->flags & F_MACDST_RND)
1784 mc = pktgen_random() % (pkt_dev->dst_mac_count);
1785
1786 else {
1787 mc = pkt_dev->cur_dst_mac_offset++;
1788 if (pkt_dev->cur_dst_mac_offset > pkt_dev->dst_mac_count) {
1789 pkt_dev->cur_dst_mac_offset = 0;
1790 }
1791 }
1792
1793 tmp = pkt_dev->dst_mac[5] + (mc & 0xFF);
1794 pkt_dev->hh[5] = tmp;
1795 tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
1796 pkt_dev->hh[4] = tmp;
1797 tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
1798 pkt_dev->hh[3] = tmp;
1799 tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
1800 pkt_dev->hh[2] = tmp;
1801 tmp = (pkt_dev->dst_mac[1] + (tmp >> 8));
1802 pkt_dev->hh[1] = tmp;
1803 }
1804
1805 if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
1806 if (pkt_dev->flags & F_UDPSRC_RND)
1807 pkt_dev->cur_udp_src = ((pktgen_random() % (pkt_dev->udp_src_max - pkt_dev->udp_src_min)) + pkt_dev->udp_src_min);
1808
1809 else {
1810 pkt_dev->cur_udp_src++;
1811 if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max)
1812 pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
1813 }
1814 }
1815
1816 if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
1817 if (pkt_dev->flags & F_UDPDST_RND) {
1818 pkt_dev->cur_udp_dst = ((pktgen_random() % (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)) + pkt_dev->udp_dst_min);
1819 }
1820 else {
1821 pkt_dev->cur_udp_dst++;
1822 if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
1823 pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
1824 }
1825 }
1826
1827 if (!(pkt_dev->flags & F_IPV6)) {
1828
1829 if ((imn = ntohl(pkt_dev->saddr_min)) < (imx = ntohl(pkt_dev->saddr_max))) {
1830 __u32 t;
1831 if (pkt_dev->flags & F_IPSRC_RND)
1832 t = ((pktgen_random() % (imx - imn)) + imn);
1833 else {
1834 t = ntohl(pkt_dev->cur_saddr);
1835 t++;
1836 if (t > imx) {
1837 t = imn;
1838 }
1839 }
1840 pkt_dev->cur_saddr = htonl(t);
1841 }
1842
1843 if (pkt_dev->cflows && pkt_dev->flows[flow].count != 0) {
1844 pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr;
1845 } else {
1846
1847 if ((imn = ntohl(pkt_dev->daddr_min)) < (imx = ntohl(pkt_dev->daddr_max))) {
1848 __u32 t;
1849 if (pkt_dev->flags & F_IPDST_RND) {
1850
1851 t = ((pktgen_random() % (imx - imn)) + imn);
1852 t = htonl(t);
1853
1854 while( LOOPBACK(t) || MULTICAST(t) || BADCLASS(t) || ZERONET(t) || LOCAL_MCAST(t) ) {
1855 t = ((pktgen_random() % (imx - imn)) + imn);
1856 t = htonl(t);
1857 }
1858 pkt_dev->cur_daddr = t;
1859 }
1860
1861 else {
1862 t = ntohl(pkt_dev->cur_daddr);
1863 t++;
1864 if (t > imx) {
1865 t = imn;
1866 }
1867 pkt_dev->cur_daddr = htonl(t);
1868 }
1869 }
1870 if(pkt_dev->cflows) {
1871 pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr;
1872 pkt_dev->nflows++;
1873 }
1874 }
1875 }
1876	else /* IPv6 */
1877 {
1878 if(pkt_dev->min_in6_daddr.s6_addr32[0] == 0 &&
1879 pkt_dev->min_in6_daddr.s6_addr32[1] == 0 &&
1880 pkt_dev->min_in6_daddr.s6_addr32[2] == 0 &&
1881 pkt_dev->min_in6_daddr.s6_addr32[3] == 0);
1882 else {
1883 int i;
1884
1885 /* Only random destinations yet */
1886
1887 for(i=0; i < 4; i++) {
1888 pkt_dev->cur_in6_daddr.s6_addr32[i] =
1889 ((pktgen_random() |
1890 pkt_dev->min_in6_daddr.s6_addr32[i]) &
1891 pkt_dev->max_in6_daddr.s6_addr32[i]);
1892 }
1893 }
1894 }
1895
1896 if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
1897 __u32 t;
1898 if (pkt_dev->flags & F_TXSIZE_RND) {
1899 t = ((pktgen_random() % (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size))
1900 + pkt_dev->min_pkt_size);
1901 }
1902 else {
1903 t = pkt_dev->cur_pkt_size + 1;
1904 if (t > pkt_dev->max_pkt_size)
1905 t = pkt_dev->min_pkt_size;
1906 }
1907 pkt_dev->cur_pkt_size = t;
1908 }
1909
1910 pkt_dev->flows[flow].count++;
1911}
1912
1913
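/* Build one IPv4/UDP test packet: 14-byte Ethernet header (from pkt_dev->hh),
 * 20-byte IP header, 8-byte UDP header, then a payload beginning with the
 * pktgen_hdr (magic, sequence number, timestamp).  With nfrags > 0 the
 * payload is spread over page fragments instead of the linear skb area.
 */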
1914static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
1915 struct pktgen_dev *pkt_dev)
1916{
1917 struct sk_buff *skb = NULL;
1918 __u8 *eth;
1919 struct udphdr *udph;
1920 int datalen, iplen;
1921 struct iphdr *iph;
1922 struct pktgen_hdr *pgh = NULL;
1923
1924 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
1925 if (!skb) {
1926 sprintf(pkt_dev->result, "No memory");
1927 return NULL;
1928 }
1929
1930 skb_reserve(skb, 16);
1931
1932 /* Reserve for ethernet and IP header */
1933 eth = (__u8 *) skb_push(skb, 14);
1934 iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
1935 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
1936
1937	/* Update any of the values we are incrementing or randomizing
1938	 * (addresses, UDP ports, MACs, packet size).
1939 */
1940 mod_cur_headers(pkt_dev);
1941
1942 memcpy(eth, pkt_dev->hh, 12);
1943 *(u16*)&eth[12] = __constant_htons(ETH_P_IP);
1944
1945 datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8; /* Eth + IPh + UDPh */
1946 if (datalen < sizeof(struct pktgen_hdr))
1947 datalen = sizeof(struct pktgen_hdr);
1948
1949 udph->source = htons(pkt_dev->cur_udp_src);
1950 udph->dest = htons(pkt_dev->cur_udp_dst);
1951 udph->len = htons(datalen + 8); /* DATA + udphdr */
1952 udph->check = 0; /* No checksum */
1953
1954 iph->ihl = 5;
1955 iph->version = 4;
1956 iph->ttl = 32;
1957 iph->tos = 0;
1958 iph->protocol = IPPROTO_UDP; /* UDP */
1959 iph->saddr = pkt_dev->cur_saddr;
1960 iph->daddr = pkt_dev->cur_daddr;
1961 iph->frag_off = 0;
1962 iplen = 20 + 8 + datalen;
1963 iph->tot_len = htons(iplen);
1964 iph->check = 0;
1965 iph->check = ip_fast_csum((void *) iph, iph->ihl);
1966 skb->protocol = __constant_htons(ETH_P_IP);
1967 skb->mac.raw = ((u8 *)iph) - 14;
1968 skb->dev = odev;
1969 skb->pkt_type = PACKET_HOST;
1970
1971 if (pkt_dev->nfrags <= 0)
1972 pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
1973 else {
1974 int frags = pkt_dev->nfrags;
1975 int i;
1976
1977 pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8);
1978
1979 if (frags > MAX_SKB_FRAGS)
1980 frags = MAX_SKB_FRAGS;
1981 if (datalen > frags*PAGE_SIZE) {
1982 skb_put(skb, datalen-frags*PAGE_SIZE);
1983 datalen = frags*PAGE_SIZE;
1984 }
1985
1986 i = 0;
1987 while (datalen > 0) {
1988 struct page *page = alloc_pages(GFP_KERNEL, 0);
1989 skb_shinfo(skb)->frags[i].page = page;
1990 skb_shinfo(skb)->frags[i].page_offset = 0;
1991 skb_shinfo(skb)->frags[i].size =
1992 (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
1993 datalen -= skb_shinfo(skb)->frags[i].size;
1994 skb->len += skb_shinfo(skb)->frags[i].size;
1995 skb->data_len += skb_shinfo(skb)->frags[i].size;
1996 i++;
1997 skb_shinfo(skb)->nr_frags = i;
1998 }
1999
2000 while (i < frags) {
2001 int rem;
2002
2003 if (i == 0)
2004 break;
2005
2006 rem = skb_shinfo(skb)->frags[i - 1].size / 2;
2007 if (rem == 0)
2008 break;
2009
2010 skb_shinfo(skb)->frags[i - 1].size -= rem;
2011
2012 skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1];
2013 get_page(skb_shinfo(skb)->frags[i].page);
2014 skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page;
2015 skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size;
2016 skb_shinfo(skb)->frags[i].size = rem;
2017 i++;
2018 skb_shinfo(skb)->nr_frags = i;
2019 }
2020 }
2021
2022 /* Stamp the time, and sequence number, convert them to network byte order */
2023
2024 if (pgh) {
2025 struct timeval timestamp;
2026
2027 pgh->pgh_magic = htonl(PKTGEN_MAGIC);
2028 pgh->seq_num = htonl(pkt_dev->seq_num);
2029
2030 do_gettimeofday(&timestamp);
2031 pgh->tv_sec = htonl(timestamp.tv_sec);
2032 pgh->tv_usec = htonl(timestamp.tv_usec);
2033 }
2034 pkt_dev->seq_num++;
2035
2036 return skb;
2037}
2038
2039/*
2040 * scan_ip6, fmt_ip6 taken from dietlibc-0.21
2041 * Author Felix von Leitner <felix-dietlibc@fefe.de>
2042 *
2043 * Slightly modified for kernel.
2044 * Should be candidate for net/ipv4/utils.c
2045 * --ro
2046 */
2047
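/* Example (illustrative): scan_ip6("fec0::1", ip) fills ip[] with
 * fe c0 00 ... 00 01 and returns the number of characters consumed;
 * fmt_ip6() maps the 16 bytes back to text, compressing runs of zero
 * words with "::".
 */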
2048static unsigned int scan_ip6(const char *s,char ip[16])
2049{
2050 unsigned int i;
2051 unsigned int len=0;
2052 unsigned long u;
2053 char suffix[16];
2054 unsigned int prefixlen=0;
2055 unsigned int suffixlen=0;
2056 __u32 tmp;
2057
2058 for (i=0; i<16; i++) ip[i]=0;
2059
2060 for (;;) {
2061 if (*s == ':') {
2062 len++;
2063 if (s[1] == ':') { /* Found "::", skip to part 2 */
2064 s+=2;
2065 len++;
2066 break;
2067 }
2068 s++;
2069 }
2070 {
2071 char *tmp;
2072 u=simple_strtoul(s,&tmp,16);
2073 i=tmp-s;
2074 }
2075
2076 if (!i) return 0;
2077 if (prefixlen==12 && s[i]=='.') {
2078
2079 /* the last 4 bytes may be written as IPv4 address */
2080
2081 tmp = in_aton(s);
2082 memcpy((struct in_addr*)(ip+12), &tmp, sizeof(tmp));
2083 return i+len;
2084 }
2085 ip[prefixlen++] = (u >> 8);
2086 ip[prefixlen++] = (u & 255);
2087 s += i; len += i;
2088 if (prefixlen==16)
2089 return len;
2090 }
2091
2092/* part 2, after "::" */
2093 for (;;) {
2094 if (*s == ':') {
2095 if (suffixlen==0)
2096 break;
2097 s++;
2098 len++;
2099 } else if (suffixlen!=0)
2100 break;
2101 {
2102 char *tmp;
2103 u=simple_strtol(s,&tmp,16);
2104 i=tmp-s;
2105 }
2106 if (!i) {
2107 if (*s) len--;
2108 break;
2109 }
2110 if (suffixlen+prefixlen<=12 && s[i]=='.') {
2111 tmp = in_aton(s);
2112 memcpy((struct in_addr*)(suffix+suffixlen), &tmp, sizeof(tmp));
2113 suffixlen+=4;
2114 len+=strlen(s);
2115 break;
2116 }
2117 suffix[suffixlen++] = (u >> 8);
2118 suffix[suffixlen++] = (u & 255);
2119 s += i; len += i;
2120 if (prefixlen+suffixlen==16)
2121 break;
2122 }
2123 for (i=0; i<suffixlen; i++)
2124 ip[16-suffixlen+i] = suffix[i];
2125 return len;
2126}
2127
2128static char tohex(char hexdigit) {
2129 return hexdigit>9?hexdigit+'a'-10:hexdigit+'0';
2130}
2131
2132static int fmt_xlong(char* s,unsigned int i) {
2133 char* bak=s;
2134 *s=tohex((i>>12)&0xf); if (s!=bak || *s!='0') ++s;
2135 *s=tohex((i>>8)&0xf); if (s!=bak || *s!='0') ++s;
2136 *s=tohex((i>>4)&0xf); if (s!=bak || *s!='0') ++s;
2137 *s=tohex(i&0xf);
2138 return s-bak+1;
2139}
2140
2141static unsigned int fmt_ip6(char *s,const char ip[16]) {
2142 unsigned int len;
2143 unsigned int i;
2144 unsigned int temp;
2145 unsigned int compressing;
2146 int j;
2147
2148 len = 0; compressing = 0;
2149 for (j=0; j<16; j+=2) {
2150
2151#ifdef V4MAPPEDPREFIX
2152 if (j==12 && !memcmp(ip,V4mappedprefix,12)) {
2153 inet_ntoa_r(*(struct in_addr*)(ip+12),s);
2154 temp=strlen(s);
2155 return len+temp;
2156 }
2157#endif
2158 temp = ((unsigned long) (unsigned char) ip[j] << 8) +
2159 (unsigned long) (unsigned char) ip[j+1];
2160 if (temp == 0) {
2161 if (!compressing) {
2162 compressing=1;
2163 if (j==0) {
2164 *s++=':'; ++len;
2165 }
2166 }
2167 } else {
2168 if (compressing) {
2169 compressing=0;
2170 *s++=':'; ++len;
2171 }
2172 i = fmt_xlong(s,temp); len += i; s += i;
2173 if (j<14) {
2174 *s++ = ':';
2175 ++len;
2176 }
2177 }
2178 }
2179 if (compressing) {
2180 *s++=':'; ++len;
2181 }
2182 *s=0;
2183 return len;
2184}
2185
2186static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
2187 struct pktgen_dev *pkt_dev)
2188{
2189 struct sk_buff *skb = NULL;
2190 __u8 *eth;
2191 struct udphdr *udph;
2192 int datalen;
2193 struct ipv6hdr *iph;
2194 struct pktgen_hdr *pgh = NULL;
2195
2196 skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
2197 if (!skb) {
2198 sprintf(pkt_dev->result, "No memory");
2199 return NULL;
2200 }
2201
2202 skb_reserve(skb, 16);
2203
2204 /* Reserve for ethernet and IP header */
2205 eth = (__u8 *) skb_push(skb, 14);
2206 iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
2207 udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
2208
2209
2210	/* Update any of the values we are incrementing or randomizing
2211	 * (addresses, UDP ports, MACs, packet size).
2212 */
2213 mod_cur_headers(pkt_dev);
2214
2215
2216 memcpy(eth, pkt_dev->hh, 12);
2217 *(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
2218
2219
2220 datalen = pkt_dev->cur_pkt_size-14-
2221 sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
2222
2223 if (datalen < sizeof(struct pktgen_hdr)) {
2224 datalen = sizeof(struct pktgen_hdr);
2225 if (net_ratelimit())
2226 printk(KERN_INFO "pktgen: increased datalen to %d\n", datalen);
2227 }
2228
2229 udph->source = htons(pkt_dev->cur_udp_src);
2230 udph->dest = htons(pkt_dev->cur_udp_dst);
2231 udph->len = htons(datalen + sizeof(struct udphdr));
2232 udph->check = 0; /* No checksum */
2233
2234 *(u32*)iph = __constant_htonl(0x60000000); /* Version + flow */
2235
2236 iph->hop_limit = 32;
2237
2238 iph->payload_len = htons(sizeof(struct udphdr) + datalen);
2239 iph->nexthdr = IPPROTO_UDP;
2240
2241 ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr);
2242 ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr);
2243
2244 skb->mac.raw = ((u8 *)iph) - 14;
2245 skb->protocol = __constant_htons(ETH_P_IPV6);
2246 skb->dev = odev;
2247 skb->pkt_type = PACKET_HOST;
2248
2249 if (pkt_dev->nfrags <= 0)
2250 pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
2251 else {
2252 int frags = pkt_dev->nfrags;
2253 int i;
2254
2255 pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8);
2256
2257 if (frags > MAX_SKB_FRAGS)
2258 frags = MAX_SKB_FRAGS;
2259 if (datalen > frags*PAGE_SIZE) {
2260 skb_put(skb, datalen-frags*PAGE_SIZE);
2261 datalen = frags*PAGE_SIZE;
2262 }
2263
2264 i = 0;
2265 while (datalen > 0) {
2266 struct page *page = alloc_pages(GFP_KERNEL, 0);
2267 skb_shinfo(skb)->frags[i].page = page;
2268 skb_shinfo(skb)->frags[i].page_offset = 0;
2269 skb_shinfo(skb)->frags[i].size =
2270 (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
2271 datalen -= skb_shinfo(skb)->frags[i].size;
2272 skb->len += skb_shinfo(skb)->frags[i].size;
2273 skb->data_len += skb_shinfo(skb)->frags[i].size;
2274 i++;
2275 skb_shinfo(skb)->nr_frags = i;
2276 }
2277
2278 while (i < frags) {
2279 int rem;
2280
2281 if (i == 0)
2282 break;
2283
2284 rem = skb_shinfo(skb)->frags[i - 1].size / 2;
2285 if (rem == 0)
2286 break;
2287
2288 skb_shinfo(skb)->frags[i - 1].size -= rem;
2289
2290 skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1];
2291 get_page(skb_shinfo(skb)->frags[i].page);
2292 skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page;
2293 skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size;
2294 skb_shinfo(skb)->frags[i].size = rem;
2295 i++;
2296 skb_shinfo(skb)->nr_frags = i;
2297 }
2298 }
2299
2300 /* Stamp the time, and sequence number, convert them to network byte order */
2301 /* should we update cloned packets too ? */
2302 if (pgh) {
2303 struct timeval timestamp;
2304
2305 pgh->pgh_magic = htonl(PKTGEN_MAGIC);
2306 pgh->seq_num = htonl(pkt_dev->seq_num);
2307
2308 do_gettimeofday(&timestamp);
2309 pgh->tv_sec = htonl(timestamp.tv_sec);
2310 pgh->tv_usec = htonl(timestamp.tv_usec);
2311 }
2312 pkt_dev->seq_num++;
2313
2314 return skb;
2315}
2316
2317static inline struct sk_buff *fill_packet(struct net_device *odev,
2318 struct pktgen_dev *pkt_dev)
2319{
2320 if(pkt_dev->flags & F_IPV6)
2321 return fill_packet_ipv6(odev, pkt_dev);
2322 else
2323 return fill_packet_ipv4(odev, pkt_dev);
2324}
2325
2326static void pktgen_clear_counters(struct pktgen_dev *pkt_dev)
2327{
2328 pkt_dev->seq_num = 1;
2329 pkt_dev->idle_acc = 0;
2330 pkt_dev->sofar = 0;
2331 pkt_dev->tx_bytes = 0;
2332 pkt_dev->errors = 0;
2333}
2334
2335/* Set up structure for sending pkts, clear counters */
2336
2337static void pktgen_run(struct pktgen_thread *t)
2338{
2339 struct pktgen_dev *pkt_dev = NULL;
2340 int started = 0;
2341
2342 PG_DEBUG(printk("pktgen: entering pktgen_run. %p\n", t));
2343
2344 if_lock(t);
2345 for (pkt_dev = t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) {
2346
2347 /*
2348 * setup odev and create initial packet.
2349 */
2350 pktgen_setup_inject(pkt_dev);
2351
2352 if(pkt_dev->odev) {
2353 pktgen_clear_counters(pkt_dev);
2354 pkt_dev->running = 1; /* Cranke yeself! */
2355 pkt_dev->skb = NULL;
2356 pkt_dev->started_at = getCurUs();
2357 pkt_dev->next_tx_us = getCurUs(); /* Transmit immediately */
2358 pkt_dev->next_tx_ns = 0;
2359
2360 strcpy(pkt_dev->result, "Starting");
2361 started++;
2362 }
2363 else
2364 strcpy(pkt_dev->result, "Error starting");
2365 }
2366 if_unlock(t);
2367 if(started) t->control &= ~(T_STOP);
2368}
2369
2370static void pktgen_stop_all_threads_ifs(void)
2371{
2372 struct pktgen_thread *t = pktgen_threads;
2373
2374 PG_DEBUG(printk("pktgen: entering pktgen_stop_all_threads.\n"));
2375
2376 thread_lock();
2377 while(t) {
2378 pktgen_stop(t);
2379 t = t->next;
2380 }
2381 thread_unlock();
2382}
2383
2384static int thread_is_running(struct pktgen_thread *t )
2385{
2386 struct pktgen_dev *next;
2387 int res = 0;
2388
2389 for(next=t->if_list; next; next=next->next) {
2390 if(next->running) {
2391 res = 1;
2392 break;
2393 }
2394 }
2395 return res;
2396}
2397
2398static int pktgen_wait_thread_run(struct pktgen_thread *t )
2399{
2400 if_lock(t);
2401
2402 while(thread_is_running(t)) {
2403
2404 if_unlock(t);
2405
2406 msleep_interruptible(100);
2407
2408 if (signal_pending(current))
2409 goto signal;
2410 if_lock(t);
2411 }
2412 if_unlock(t);
2413 return 1;
2414 signal:
2415 return 0;
2416}
2417
2418static int pktgen_wait_all_threads_run(void)
2419{
2420 struct pktgen_thread *t = pktgen_threads;
2421 int sig = 1;
2422
2423 while (t) {
2424 sig = pktgen_wait_thread_run(t);
2425 if( sig == 0 ) break;
2426 thread_lock();
2427 t=t->next;
2428 thread_unlock();
2429 }
2430 if(sig == 0) {
2431 thread_lock();
2432 while (t) {
2433 t->control |= (T_STOP);
2434 t=t->next;
2435 }
2436 thread_unlock();
2437 }
2438 return sig;
2439}
2440
2441static void pktgen_run_all_threads(void)
2442{
2443 struct pktgen_thread *t = pktgen_threads;
2444
2445 PG_DEBUG(printk("pktgen: entering pktgen_run_all_threads.\n"));
2446
2447 thread_lock();
2448
2449 while(t) {
2450 t->control |= (T_RUN);
2451 t = t->next;
2452 }
2453 thread_unlock();
2454
2455 current->state = TASK_INTERRUPTIBLE;
2456 schedule_timeout(HZ/8); /* Propagate thread->control */
2457
2458 pktgen_wait_all_threads_run();
2459}
2460
2461
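/* Format the per-device result line.  pps starts as sofar * USEC_PER_SEC;
 * it and the elapsed time are halved together until the elapsed time fits
 * in 32 bits, since do_div() only takes a 32-bit divisor.
 */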
2462static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
2463{
2464 __u64 total_us, bps, mbps, pps, idle;
2465 char *p = pkt_dev->result;
2466
2467 total_us = pkt_dev->stopped_at - pkt_dev->started_at;
2468
2469 idle = pkt_dev->idle_acc;
2470
2471 p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
2472 (unsigned long long) total_us,
2473 (unsigned long long)(total_us - idle),
2474 (unsigned long long) idle,
2475 (unsigned long long) pkt_dev->sofar,
2476 pkt_dev->cur_pkt_size, nr_frags);
2477
2478 pps = pkt_dev->sofar * USEC_PER_SEC;
2479
2480 while ((total_us >> 32) != 0) {
2481 pps >>= 1;
2482 total_us >>= 1;
2483 }
2484
2485 do_div(pps, total_us);
2486
2487 bps = pps * 8 * pkt_dev->cur_pkt_size;
2488
2489 mbps = bps;
2490 do_div(mbps, 1000000);
2491 p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu",
2492 (unsigned long long) pps,
2493 (unsigned long long) mbps,
2494 (unsigned long long) bps,
2495 (unsigned long long) pkt_dev->errors);
2496}
2497
2498
2499/* Set stopped-at timer, remove from running list, do counters & statistics */
2500
2501static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
2502{
2503
2504 if (!pkt_dev->running) {
2505 printk("pktgen: interface: %s is already stopped\n", pkt_dev->ifname);
2506 return -EINVAL;
2507 }
2508
2509 pkt_dev->stopped_at = getCurUs();
2510 pkt_dev->running = 0;
2511
2512 show_results(pkt_dev, skb_shinfo(pkt_dev->skb)->nr_frags);
2513
2514 if (pkt_dev->skb)
2515 kfree_skb(pkt_dev->skb);
2516
2517 pkt_dev->skb = NULL;
2518
2519 return 0;
2520}
2521
2522static struct pktgen_dev *next_to_run(struct pktgen_thread *t )
2523{
2524 struct pktgen_dev *next, *best = NULL;
2525
2526 if_lock(t);
2527
2528 for(next=t->if_list; next ; next=next->next) {
2529 if(!next->running) continue;
2530 if(best == NULL) best=next;
2531 else if ( next->next_tx_us < best->next_tx_us)
2532 best = next;
2533 }
2534 if_unlock(t);
2535 return best;
2536}
2537
2538static void pktgen_stop(struct pktgen_thread *t) {
2539 struct pktgen_dev *next = NULL;
2540
2541 PG_DEBUG(printk("pktgen: entering pktgen_stop.\n"));
2542
2543 if_lock(t);
2544
2545 for(next=t->if_list; next; next=next->next)
2546 pktgen_stop_device(next);
2547
2548 if_unlock(t);
2549}
2550
2551static void pktgen_rem_all_ifs(struct pktgen_thread *t)
2552{
2553 struct pktgen_dev *cur, *next = NULL;
2554
2555 /* Remove all devices, free mem */
2556
2557 if_lock(t);
2558
2559 for(cur=t->if_list; cur; cur=next) {
2560 next = cur->next;
2561 pktgen_remove_device(t, cur);
2562 }
2563
2564 if_unlock(t);
2565}
2566
2567static void pktgen_rem_thread(struct pktgen_thread *t)
2568{
2569 /* Remove from the thread list */
2570
2571 struct pktgen_thread *tmp = pktgen_threads;
2572
2573 if (strlen(t->fname))
2574 remove_proc_entry(t->fname, NULL);
2575
2576 thread_lock();
2577
2578 if (tmp == t)
2579 pktgen_threads = tmp->next;
2580 else {
2581 while (tmp) {
2582 if (tmp->next == t) {
2583 tmp->next = t->next;
2584 t->next = NULL;
2585 break;
2586 }
2587 tmp = tmp->next;
2588 }
2589 }
2590 thread_unlock();
2591}
2592
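/* Transmit one packet for pkt_dev.  Honours the configured delay, reuses
 * the current skb until clone_skb transmissions have been made, retries
 * immediately on NETDEV_TX_LOCKED for LLTX devices, and stops the device
 * once pkt_dev->count packets have been sent (count 0 means run forever).
 */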
2593static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
2594{
2595 struct net_device *odev = NULL;
2596 __u64 idle_start = 0;
2597 int ret;
2598
2599 odev = pkt_dev->odev;
2600
2601 if (pkt_dev->delay_us || pkt_dev->delay_ns) {
2602 u64 now;
2603
2604 now = getCurUs();
2605 if (now < pkt_dev->next_tx_us)
2606 spin(pkt_dev, pkt_dev->next_tx_us);
2607
2608		/* The maximum delay value (0x7FFFFFFF) has the special meaning
2609		 * of "never transmit".
2610 */
2611 if (pkt_dev->delay_us == 0x7FFFFFFF) {
2612 pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us;
2613 pkt_dev->next_tx_ns = pkt_dev->delay_ns;
2614 goto out;
2615 }
2616 }
2617
2618 if (netif_queue_stopped(odev) || need_resched()) {
2619 idle_start = getCurUs();
2620
2621 if (!netif_running(odev)) {
2622 pktgen_stop_device(pkt_dev);
2623 goto out;
2624 }
2625 if (need_resched())
2626 schedule();
2627
2628 pkt_dev->idle_acc += getCurUs() - idle_start;
2629
2630 if (netif_queue_stopped(odev)) {
2631 pkt_dev->next_tx_us = getCurUs(); /* TODO */
2632 pkt_dev->next_tx_ns = 0;
2633 goto out; /* Try the next interface */
2634 }
2635 }
2636
2637 if (pkt_dev->last_ok || !pkt_dev->skb) {
2638 if ((++pkt_dev->clone_count >= pkt_dev->clone_skb ) || (!pkt_dev->skb)) {
2639 /* build a new pkt */
2640 if (pkt_dev->skb)
2641 kfree_skb(pkt_dev->skb);
2642
2643 pkt_dev->skb = fill_packet(odev, pkt_dev);
2644 if (pkt_dev->skb == NULL) {
2645 printk("pktgen: ERROR: couldn't allocate skb in fill_packet.\n");
2646 schedule();
2647 pkt_dev->clone_count--; /* back out increment, OOM */
2648 goto out;
2649 }
2650 pkt_dev->allocated_skbs++;
2651 pkt_dev->clone_count = 0; /* reset counter */
2652 }
2653 }
2654
2655 spin_lock_bh(&odev->xmit_lock);
2656 if (!netif_queue_stopped(odev)) {
2657
2658 atomic_inc(&(pkt_dev->skb->users));
2659retry_now:
2660 ret = odev->hard_start_xmit(pkt_dev->skb, odev);
2661 if (likely(ret == NETDEV_TX_OK)) {
2662 pkt_dev->last_ok = 1;
2663 pkt_dev->sofar++;
2664 pkt_dev->seq_num++;
2665 pkt_dev->tx_bytes += pkt_dev->cur_pkt_size;
2666
2667 } else if (ret == NETDEV_TX_LOCKED
2668 && (odev->features & NETIF_F_LLTX)) {
2669 cpu_relax();
2670 goto retry_now;
2671 } else { /* Retry it next time */
2672
2673 atomic_dec(&(pkt_dev->skb->users));
2674
2675 if (debug && net_ratelimit())
2676 printk(KERN_INFO "pktgen: Hard xmit error\n");
2677
2678 pkt_dev->errors++;
2679 pkt_dev->last_ok = 0;
2680 }
2681
2682 pkt_dev->next_tx_us = getCurUs();
2683 pkt_dev->next_tx_ns = 0;
2684
2685 pkt_dev->next_tx_us += pkt_dev->delay_us;
2686 pkt_dev->next_tx_ns += pkt_dev->delay_ns;
2687
2688 if (pkt_dev->next_tx_ns > 1000) {
2689 pkt_dev->next_tx_us++;
2690 pkt_dev->next_tx_ns -= 1000;
2691 }
2692 }
2693
2694 else { /* Retry it next time */
2695 pkt_dev->last_ok = 0;
2696 pkt_dev->next_tx_us = getCurUs(); /* TODO */
2697 pkt_dev->next_tx_ns = 0;
2698 }
2699
2700 spin_unlock_bh(&odev->xmit_lock);
2701
2702 /* If pkt_dev->count is zero, then run forever */
2703 if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
2704 if (atomic_read(&(pkt_dev->skb->users)) != 1) {
2705 idle_start = getCurUs();
2706 while (atomic_read(&(pkt_dev->skb->users)) != 1) {
2707 if (signal_pending(current)) {
2708 break;
2709 }
2710 schedule();
2711 }
2712 pkt_dev->idle_acc += getCurUs() - idle_start;
2713 }
2714
2715 /* Done with this */
2716 pktgen_stop_device(pkt_dev);
2717 }
2718 out:;
2719 }
2720
2721/*
2722 * Main loop of the thread goes here
2723 */
2724
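/* The worker loop reacts to control bits set by the proc handlers and the
 * module exit path: T_RUN starts all of the thread's devices, T_STOP stops
 * them, T_REMDEV removes them, and T_TERMINATE (or a pending signal) makes
 * the thread clean up and exit.
 */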
2725static void pktgen_thread_worker(struct pktgen_thread *t)
2726{
2727 DEFINE_WAIT(wait);
2728 struct pktgen_dev *pkt_dev = NULL;
2729 int cpu = t->cpu;
2730 sigset_t tmpsig;
2731 u32 max_before_softirq;
2732 u32 tx_since_softirq = 0;
2733
2734 daemonize("pktgen/%d", cpu);
2735
2736 /* Block all signals except SIGKILL, SIGSTOP and SIGTERM */
2737
2738 spin_lock_irq(&current->sighand->siglock);
2739 tmpsig = current->blocked;
2740 siginitsetinv(&current->blocked,
2741 sigmask(SIGKILL) |
2742 sigmask(SIGSTOP)|
2743 sigmask(SIGTERM));
2744
2745 recalc_sigpending();
2746 spin_unlock_irq(&current->sighand->siglock);
2747
2748 /* Migrate to the right CPU */
2749 set_cpus_allowed(current, cpumask_of_cpu(cpu));
2750 if (smp_processor_id() != cpu)
2751 BUG();
2752
2753 init_waitqueue_head(&t->queue);
2754
2755 t->control &= ~(T_TERMINATE);
2756 t->control &= ~(T_RUN);
2757 t->control &= ~(T_STOP);
2758 t->control &= ~(T_REMDEV);
2759
2760 t->pid = current->pid;
2761
2762 PG_DEBUG(printk("pktgen: starting pktgen/%d: pid=%d\n", cpu, current->pid));
2763
2764 max_before_softirq = t->max_before_softirq;
2765
2766 __set_current_state(TASK_INTERRUPTIBLE);
2767 mb();
2768
2769 while (1) {
2770
2771 __set_current_state(TASK_RUNNING);
2772
2773 /*
2774 * Get next dev to xmit -- if any.
2775 */
2776
2777 pkt_dev = next_to_run(t);
2778
2779 if (pkt_dev) {
2780
2781 pktgen_xmit(pkt_dev);
2782
2783 /*
2784 * We like to stay RUNNING but must also give
2785 * others fair share.
2786 */
2787
2788 tx_since_softirq += pkt_dev->last_ok;
2789
2790 if (tx_since_softirq > max_before_softirq) {
2791 if (local_softirq_pending())
2792 do_softirq();
2793 tx_since_softirq = 0;
2794 }
2795 } else {
2796 prepare_to_wait(&(t->queue), &wait, TASK_INTERRUPTIBLE);
2797 schedule_timeout(HZ/10);
2798 finish_wait(&(t->queue), &wait);
2799 }
2800
2801 /*
2802 * Back from sleep, either due to the timeout or signal.
2803 * We check if we have any "posted" work for us.
2804 */
2805
2806 if (t->control & T_TERMINATE || signal_pending(current))
2807 /* we received a request to terminate ourself */
2808 break;
2809
2810
2811 if(t->control & T_STOP) {
2812 pktgen_stop(t);
2813 t->control &= ~(T_STOP);
2814 }
2815
2816 if(t->control & T_RUN) {
2817 pktgen_run(t);
2818 t->control &= ~(T_RUN);
2819 }
2820
2821 if(t->control & T_REMDEV) {
2822 pktgen_rem_all_ifs(t);
2823 t->control &= ~(T_REMDEV);
2824 }
2825
2826 if (need_resched())
2827 schedule();
2828 }
2829
2830 PG_DEBUG(printk("pktgen: %s stopping all device\n", t->name));
2831 pktgen_stop(t);
2832
2833 PG_DEBUG(printk("pktgen: %s removing all device\n", t->name));
2834 pktgen_rem_all_ifs(t);
2835
2836 PG_DEBUG(printk("pktgen: %s removing thread.\n", t->name));
2837 pktgen_rem_thread(t);
2838}
2839
2840static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char* ifname)
2841{
2842 struct pktgen_dev *pkt_dev = NULL;
2843 if_lock(t);
2844
2845 for(pkt_dev=t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) {
2846 if (strcmp(pkt_dev->ifname, ifname) == 0) {
2847 break;
2848 }
2849 }
2850
2851 if_unlock(t);
2852 PG_DEBUG(printk("pktgen: find_dev(%s) returning %p\n", ifname,pkt_dev));
2853 return pkt_dev;
2854}
2855
2856/*
2857 * Adds a dev at front of if_list.
2858 */
2859
2860static int add_dev_to_thread(struct pktgen_thread *t, struct pktgen_dev *pkt_dev)
2861{
2862 int rv = 0;
2863
2864 if_lock(t);
2865
2866 if (pkt_dev->pg_thread) {
2867 printk("pktgen: ERROR: already assigned to a thread.\n");
2868 rv = -EBUSY;
2869 goto out;
2870 }
2871 pkt_dev->next =t->if_list; t->if_list=pkt_dev;
2872 pkt_dev->pg_thread = t;
2873 pkt_dev->running = 0;
2874
2875 out:
2876 if_unlock(t);
2877 return rv;
2878}
2879
2880/* Called under thread lock */
2881
2882static int pktgen_add_device(struct pktgen_thread *t, const char* ifname)
2883{
2884 struct pktgen_dev *pkt_dev;
2885
2886 /* We don't allow a device to be on several threads */
2887
2888 if( (pkt_dev = __pktgen_NN_threads(ifname, FIND)) == NULL) {
2889
2890 pkt_dev = kmalloc(sizeof(struct pktgen_dev), GFP_KERNEL);
2891 if (!pkt_dev)
2892 return -ENOMEM;
2893
2894 memset(pkt_dev, 0, sizeof(struct pktgen_dev));
2895
2896 pkt_dev->flows = vmalloc(MAX_CFLOWS*sizeof(struct flow_state));
2897 if (pkt_dev->flows == NULL) {
2898 kfree(pkt_dev);
2899 return -ENOMEM;
2900 }
2901 memset(pkt_dev->flows, 0, MAX_CFLOWS*sizeof(struct flow_state));
2902
2903 pkt_dev->min_pkt_size = ETH_ZLEN;
2904 pkt_dev->max_pkt_size = ETH_ZLEN;
2905 pkt_dev->nfrags = 0;
2906 pkt_dev->clone_skb = pg_clone_skb_d;
2907 pkt_dev->delay_us = pg_delay_d / 1000;
2908 pkt_dev->delay_ns = pg_delay_d % 1000;
2909 pkt_dev->count = pg_count_d;
2910 pkt_dev->sofar = 0;
2911 pkt_dev->udp_src_min = 9; /* sink port */
2912 pkt_dev->udp_src_max = 9;
2913 pkt_dev->udp_dst_min = 9;
2914 pkt_dev->udp_dst_max = 9;
2915
2916 strncpy(pkt_dev->ifname, ifname, 31);
2917 sprintf(pkt_dev->fname, "net/%s/%s", PG_PROC_DIR, ifname);
2918
2919 if (! pktgen_setup_dev(pkt_dev)) {
2920 printk("pktgen: ERROR: pktgen_setup_dev failed.\n");
2921 if (pkt_dev->flows)
2922 vfree(pkt_dev->flows);
2923 kfree(pkt_dev);
2924 return -ENODEV;
2925 }
2926
2927 pkt_dev->proc_ent = create_proc_entry(pkt_dev->fname, 0600, NULL);
2928 if (!pkt_dev->proc_ent) {
2929 printk("pktgen: cannot create %s procfs entry.\n", pkt_dev->fname);
2930 if (pkt_dev->flows)
2931 vfree(pkt_dev->flows);
2932 kfree(pkt_dev);
2933 return -EINVAL;
2934 }
2935 pkt_dev->proc_ent->read_proc = proc_if_read;
2936 pkt_dev->proc_ent->write_proc = proc_if_write;
2937 pkt_dev->proc_ent->data = (void*)(pkt_dev);
2938 pkt_dev->proc_ent->owner = THIS_MODULE;
2939
2940 return add_dev_to_thread(t, pkt_dev);
2941 }
2942 else {
2943 printk("pktgen: ERROR: interface already used.\n");
2944 return -EBUSY;
2945 }
2946}
2947
2948static struct pktgen_thread *pktgen_find_thread(const char* name)
2949{
2950 struct pktgen_thread *t = NULL;
2951
2952 thread_lock();
2953
2954 t = pktgen_threads;
2955 while (t) {
2956 if (strcmp(t->name, name) == 0)
2957 break;
2958
2959 t = t->next;
2960 }
2961 thread_unlock();
2962 return t;
2963}
2964
2965static int pktgen_create_thread(const char* name, int cpu)
2966{
2967 struct pktgen_thread *t = NULL;
2968
2969 if (strlen(name) > 31) {
2970 printk("pktgen: ERROR: Thread name cannot be more than 31 characters.\n");
2971 return -EINVAL;
2972 }
2973
2974 if (pktgen_find_thread(name)) {
2975 printk("pktgen: ERROR: thread: %s already exists\n", name);
2976 return -EINVAL;
2977 }
2978
2979 t = (struct pktgen_thread*)(kmalloc(sizeof(struct pktgen_thread), GFP_KERNEL));
2980 if (!t) {
2981 printk("pktgen: ERROR: out of memory, can't create new thread.\n");
2982 return -ENOMEM;
2983 }
2984
2985 memset(t, 0, sizeof(struct pktgen_thread));
2986 strcpy(t->name, name);
2987 spin_lock_init(&t->if_lock);
2988 t->cpu = cpu;
2989
2990 sprintf(t->fname, "net/%s/%s", PG_PROC_DIR, t->name);
2991 t->proc_ent = create_proc_entry(t->fname, 0600, NULL);
2992 if (!t->proc_ent) {
2993 printk("pktgen: cannot create %s procfs entry.\n", t->fname);
2994 kfree(t);
2995 return -EINVAL;
2996 }
2997 t->proc_ent->read_proc = proc_thread_read;
2998 t->proc_ent->write_proc = proc_thread_write;
2999 t->proc_ent->data = (void*)(t);
3000 t->proc_ent->owner = THIS_MODULE;
3001
3002 t->next = pktgen_threads;
3003 pktgen_threads = t;
3004
3005 if (kernel_thread((void *) pktgen_thread_worker, (void *) t,
3006 CLONE_FS | CLONE_FILES | CLONE_SIGHAND) < 0)
3007 printk("pktgen: kernel_thread() failed for cpu %d\n", t->cpu);
3008
3009 return 0;
3010}
3011
3012/*
3013 * Removes a device from the thread if_list.
3014 */
3015static void _rem_dev_from_if_list(struct pktgen_thread *t, struct pktgen_dev *pkt_dev)
3016{
3017 struct pktgen_dev *i, *prev = NULL;
3018
3019 i = t->if_list;
3020
3021 while(i) {
3022 if(i == pkt_dev) {
3023 if(prev) prev->next = i->next;
3024 else t->if_list = NULL;
3025 break;
3026 }
3027 prev = i;
3028 i=i->next;
3029 }
3030}
3031
3032static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_dev)
3033{
3034
3035 PG_DEBUG(printk("pktgen: remove_device pkt_dev=%p\n", pkt_dev));
3036
3037 if (pkt_dev->running) {
3038		printk("pktgen: WARNING: trying to remove a running interface, stopping it now.\n");
3039 pktgen_stop_device(pkt_dev);
3040 }
3041
3042 /* Dis-associate from the interface */
3043
3044 if (pkt_dev->odev) {
3045 dev_put(pkt_dev->odev);
3046 pkt_dev->odev = NULL;
3047 }
3048
3049 /* And update the thread if_list */
3050
3051 _rem_dev_from_if_list(t, pkt_dev);
3052
3053 /* Clean up proc file system */
3054
3055 if (strlen(pkt_dev->fname))
3056 remove_proc_entry(pkt_dev->fname, NULL);
3057
3058 if (pkt_dev->flows)
3059 vfree(pkt_dev->flows);
3060 kfree(pkt_dev);
3061 return 0;
3062}
3063
3064static int __init pg_init(void)
3065{
3066 int cpu;
3067 printk(version);
3068
3069 module_fname[0] = 0;
3070
3071 create_proc_dir();
3072
3073 sprintf(module_fname, "net/%s/pgctrl", PG_PROC_DIR);
3074 module_proc_ent = create_proc_entry(module_fname, 0600, NULL);
3075 if (!module_proc_ent) {
3076 printk("pktgen: ERROR: cannot create %s procfs entry.\n", module_fname);
3077 return -EINVAL;
3078 }
3079
3080 module_proc_ent->proc_fops = &pktgen_fops;
3081 module_proc_ent->data = NULL;
3082
3083 /* Register us to receive netdevice events */
3084 register_netdevice_notifier(&pktgen_notifier_block);
3085
3086 for (cpu = 0; cpu < NR_CPUS ; cpu++) {
3087 char buf[30];
3088
3089 if (!cpu_online(cpu))
3090 continue;
3091
3092 sprintf(buf, "kpktgend_%i", cpu);
3093 pktgen_create_thread(buf, cpu);
3094 }
3095 return 0;
3096}
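/*
 * A minimal userspace sketch of how the /proc files created above are
 * typically driven, assuming PG_PROC_DIR expands to "pktgen" and that a
 * "kpktgend_0" thread and an "eth0" interface exist; the pg_write()
 * helper and all paths/values here are hypothetical illustrations, not
 * part of the module itself.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void pg_write(const char *path, const char *cmd)
{
	int fd = open(path, O_WRONLY);

	if (fd >= 0) {
		write(fd, cmd, strlen(cmd));
		close(fd);
	}
}

int main(void)
{
	pg_write("/proc/net/pktgen/kpktgend_0", "add_device eth0");
	pg_write("/proc/net/pktgen/eth0", "count 1000");
	pg_write("/proc/net/pktgen/eth0", "pkt_size 300");
	pg_write("/proc/net/pktgen/pgctrl", "start");
	return 0;
}
#endif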
3097
3098static void __exit pg_cleanup(void)
3099{
3100 wait_queue_head_t queue;
3101 init_waitqueue_head(&queue);
3102
3103 /* Stop all interfaces & threads */
3104
3105 while (pktgen_threads) {
3106 struct pktgen_thread *t = pktgen_threads;
3107 pktgen_threads->control |= (T_TERMINATE);
3108
3109 wait_event_interruptible_timeout(queue, (t != pktgen_threads), HZ);
3110 }
3111
3112 /* Un-register us from receiving netdevice events */
3113 unregister_netdevice_notifier(&pktgen_notifier_block);
3114
3115 /* Clean up proc file system */
3116
3117 remove_proc_entry(module_fname, NULL);
3118
3119 remove_proc_dir();
3120}
3121
3122
3123module_init(pg_init);
3124module_exit(pg_cleanup);
3125
 3126MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>");
3127MODULE_DESCRIPTION("Packet Generator tool");
3128MODULE_LICENSE("GPL");
3129module_param(pg_count_d, int, 0);
3130module_param(pg_delay_d, int, 0);
3131module_param(pg_clone_skb_d, int, 0);
3132module_param(debug, int, 0);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
new file mode 100644
index 000000000000..d69ad90e5811
--- /dev/null
+++ b/net/core/rtnetlink.c
@@ -0,0 +1,711 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Routing netlink socket interface: protocol independent part.
7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Fixes:
16 * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
17 */
18
19#include <linux/config.h>
20#include <linux/errno.h>
21#include <linux/module.h>
22#include <linux/types.h>
23#include <linux/socket.h>
24#include <linux/kernel.h>
25#include <linux/major.h>
26#include <linux/sched.h>
27#include <linux/timer.h>
28#include <linux/string.h>
29#include <linux/sockios.h>
30#include <linux/net.h>
31#include <linux/fcntl.h>
32#include <linux/mm.h>
33#include <linux/slab.h>
34#include <linux/interrupt.h>
35#include <linux/capability.h>
36#include <linux/skbuff.h>
37#include <linux/init.h>
38#include <linux/security.h>
39
40#include <asm/uaccess.h>
41#include <asm/system.h>
42#include <asm/string.h>
43
44#include <linux/inet.h>
45#include <linux/netdevice.h>
46#include <net/ip.h>
47#include <net/protocol.h>
48#include <net/arp.h>
49#include <net/route.h>
50#include <net/udp.h>
51#include <net/sock.h>
52#include <net/pkt_sched.h>
53
54DECLARE_MUTEX(rtnl_sem);
55
56void rtnl_lock(void)
57{
58 rtnl_shlock();
59}
60
61int rtnl_lock_interruptible(void)
62{
63 return down_interruptible(&rtnl_sem);
64}
65
66void rtnl_unlock(void)
67{
68 rtnl_shunlock();
69
70 netdev_run_todo();
71}
72
73int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
74{
75 memset(tb, 0, sizeof(struct rtattr*)*maxattr);
76
77 while (RTA_OK(rta, len)) {
78 unsigned flavor = rta->rta_type;
79 if (flavor && flavor <= maxattr)
80 tb[flavor-1] = rta;
81 rta = RTA_NEXT(rta, len);
82 }
83 return 0;
84}
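/*
 * A minimal sketch of how a message handler might use rtattr_parse():
 * attributes land in the table at index (type - 1), the same convention
 * used by do_setlink() below.  The handler name is hypothetical; the
 * IFLA_RTA()/IFLA_PAYLOAD() helpers come from <linux/rtnetlink.h>.
 */
static int example_parse_link_attrs(struct nlmsghdr *nlh)
{
	struct ifinfomsg *ifm = NLMSG_DATA(nlh);
	struct rtattr *tb[IFLA_MAX];

	rtattr_parse(tb, IFLA_MAX, IFLA_RTA(ifm), IFLA_PAYLOAD(nlh));

	if (tb[IFLA_MTU - 1] &&
	    tb[IFLA_MTU - 1]->rta_len == RTA_LENGTH(sizeof(u32)))
		printk(KERN_DEBUG "requested mtu: %u\n",
		       *(u32 *) RTA_DATA(tb[IFLA_MTU - 1]));
	return 0;
}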
85
86struct sock *rtnl;
87
88struct rtnetlink_link * rtnetlink_links[NPROTO];
89
90static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] =
91{
92 NLMSG_LENGTH(sizeof(struct ifinfomsg)),
93 NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
94 NLMSG_LENGTH(sizeof(struct rtmsg)),
95 NLMSG_LENGTH(sizeof(struct ndmsg)),
96 NLMSG_LENGTH(sizeof(struct rtmsg)),
97 NLMSG_LENGTH(sizeof(struct tcmsg)),
98 NLMSG_LENGTH(sizeof(struct tcmsg)),
99 NLMSG_LENGTH(sizeof(struct tcmsg)),
100 NLMSG_LENGTH(sizeof(struct tcamsg))
101};
102
103static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] =
104{
105 IFLA_MAX,
106 IFA_MAX,
107 RTA_MAX,
108 NDA_MAX,
109 RTA_MAX,
110 TCA_MAX,
111 TCA_MAX,
112 TCA_MAX,
113 TCAA_MAX
114};
115
116void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
117{
118 struct rtattr *rta;
119 int size = RTA_LENGTH(attrlen);
120
121 rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size));
122 rta->rta_type = attrtype;
123 rta->rta_len = size;
124 memcpy(RTA_DATA(rta), data, attrlen);
125}
126
127size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
128{
129 size_t ret = RTA_PAYLOAD(rta);
130 char *src = RTA_DATA(rta);
131
132 if (ret > 0 && src[ret - 1] == '\0')
133 ret--;
134 if (size > 0) {
135 size_t len = (ret >= size) ? size - 1 : ret;
136 memset(dest, 0, size);
137 memcpy(dest, src, len);
138 }
139 return ret;
140}
141
142int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
143{
144 int err = 0;
145
146 NETLINK_CB(skb).dst_groups = group;
147 if (echo)
148 atomic_inc(&skb->users);
149 netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
150 if (echo)
151 err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
152 return err;
153}
154
155int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
156{
157 struct rtattr *mx = (struct rtattr*)skb->tail;
158 int i;
159
160 RTA_PUT(skb, RTA_METRICS, 0, NULL);
161 for (i=0; i<RTAX_MAX; i++) {
162 if (metrics[i])
163 RTA_PUT(skb, i+1, sizeof(u32), metrics+i);
164 }
165 mx->rta_len = skb->tail - (u8*)mx;
166 if (mx->rta_len == RTA_LENGTH(0))
167 skb_trim(skb, (u8*)mx - skb->data);
168 return 0;
169
170rtattr_failure:
171 skb_trim(skb, (u8*)mx - skb->data);
172 return -1;
173}
174
175
176static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
177 int type, u32 pid, u32 seq, u32 change)
178{
179 struct ifinfomsg *r;
180 struct nlmsghdr *nlh;
181 unsigned char *b = skb->tail;
182
183 nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r));
184 if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
185 r = NLMSG_DATA(nlh);
186 r->ifi_family = AF_UNSPEC;
187 r->ifi_type = dev->type;
188 r->ifi_index = dev->ifindex;
189 r->ifi_flags = dev_get_flags(dev);
190 r->ifi_change = change;
191
192 RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name);
193
194 if (1) {
195 u32 txqlen = dev->tx_queue_len;
196 RTA_PUT(skb, IFLA_TXQLEN, sizeof(txqlen), &txqlen);
197 }
198
199 if (1) {
200 u32 weight = dev->weight;
201 RTA_PUT(skb, IFLA_WEIGHT, sizeof(weight), &weight);
202 }
203
204 if (1) {
205 struct rtnl_link_ifmap map = {
206 .mem_start = dev->mem_start,
207 .mem_end = dev->mem_end,
208 .base_addr = dev->base_addr,
209 .irq = dev->irq,
210 .dma = dev->dma,
211 .port = dev->if_port,
212 };
213 RTA_PUT(skb, IFLA_MAP, sizeof(map), &map);
214 }
215
216 if (dev->addr_len) {
217 RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
218 RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast);
219 }
220
221 if (1) {
222 u32 mtu = dev->mtu;
223 RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu);
224 }
225
226 if (dev->ifindex != dev->iflink) {
227 u32 iflink = dev->iflink;
228 RTA_PUT(skb, IFLA_LINK, sizeof(iflink), &iflink);
229 }
230
231 if (dev->qdisc_sleeping)
232 RTA_PUT(skb, IFLA_QDISC,
233 strlen(dev->qdisc_sleeping->ops->id) + 1,
234 dev->qdisc_sleeping->ops->id);
235
236 if (dev->master) {
237 u32 master = dev->master->ifindex;
238 RTA_PUT(skb, IFLA_MASTER, sizeof(master), &master);
239 }
240
241 if (dev->get_stats) {
242 unsigned long *stats = (unsigned long*)dev->get_stats(dev);
243 if (stats) {
244 struct rtattr *a;
245 __u32 *s;
246 int i;
247 int n = sizeof(struct rtnl_link_stats)/4;
248
249 a = __RTA_PUT(skb, IFLA_STATS, n*4);
250 s = RTA_DATA(a);
251 for (i=0; i<n; i++)
252 s[i] = stats[i];
253 }
254 }
255 nlh->nlmsg_len = skb->tail - b;
256 return skb->len;
257
258nlmsg_failure:
259rtattr_failure:
260 skb_trim(skb, b - skb->data);
261 return -1;
262}
263
264static int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
265{
266 int idx;
267 int s_idx = cb->args[0];
268 struct net_device *dev;
269
270 read_lock(&dev_base_lock);
271 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
272 if (idx < s_idx)
273 continue;
274 if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0)
275 break;
276 }
277 read_unlock(&dev_base_lock);
278 cb->args[0] = idx;
279
280 return skb->len;
281}
282
283static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
284{
285 struct ifinfomsg *ifm = NLMSG_DATA(nlh);
286 struct rtattr **ida = arg;
287 struct net_device *dev;
288 int err, send_addr_notify = 0;
289
290 if (ifm->ifi_index >= 0)
291 dev = dev_get_by_index(ifm->ifi_index);
292 else if (ida[IFLA_IFNAME - 1]) {
293 char ifname[IFNAMSIZ];
294
295 if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1],
296 IFNAMSIZ) >= IFNAMSIZ)
297 return -EINVAL;
298 dev = dev_get_by_name(ifname);
299 } else
300 return -EINVAL;
301
302 if (!dev)
303 return -ENODEV;
304
305 err = -EINVAL;
306
307 if (ifm->ifi_flags)
308 dev_change_flags(dev, ifm->ifi_flags);
309
310 if (ida[IFLA_MAP - 1]) {
311 struct rtnl_link_ifmap *u_map;
312 struct ifmap k_map;
313
314 if (!dev->set_config) {
315 err = -EOPNOTSUPP;
316 goto out;
317 }
318
319 if (!netif_device_present(dev)) {
320 err = -ENODEV;
321 goto out;
322 }
323
324 if (ida[IFLA_MAP - 1]->rta_len != RTA_LENGTH(sizeof(*u_map)))
325 goto out;
326
327 u_map = RTA_DATA(ida[IFLA_MAP - 1]);
328
329 k_map.mem_start = (unsigned long) u_map->mem_start;
330 k_map.mem_end = (unsigned long) u_map->mem_end;
331 k_map.base_addr = (unsigned short) u_map->base_addr;
332 k_map.irq = (unsigned char) u_map->irq;
333 k_map.dma = (unsigned char) u_map->dma;
334 k_map.port = (unsigned char) u_map->port;
335
336 err = dev->set_config(dev, &k_map);
337
338 if (err)
339 goto out;
340 }
341
342 if (ida[IFLA_ADDRESS - 1]) {
343 if (!dev->set_mac_address) {
344 err = -EOPNOTSUPP;
345 goto out;
346 }
347 if (!netif_device_present(dev)) {
348 err = -ENODEV;
349 goto out;
350 }
351 if (ida[IFLA_ADDRESS - 1]->rta_len != RTA_LENGTH(dev->addr_len))
352 goto out;
353
354 err = dev->set_mac_address(dev, RTA_DATA(ida[IFLA_ADDRESS - 1]));
355 if (err)
356 goto out;
357 send_addr_notify = 1;
358 }
359
360 if (ida[IFLA_BROADCAST - 1]) {
361 if (ida[IFLA_BROADCAST - 1]->rta_len != RTA_LENGTH(dev->addr_len))
362 goto out;
363 memcpy(dev->broadcast, RTA_DATA(ida[IFLA_BROADCAST - 1]),
364 dev->addr_len);
365 send_addr_notify = 1;
366 }
367
368 if (ida[IFLA_MTU - 1]) {
369 if (ida[IFLA_MTU - 1]->rta_len != RTA_LENGTH(sizeof(u32)))
370 goto out;
371 err = dev_set_mtu(dev, *((u32 *) RTA_DATA(ida[IFLA_MTU - 1])));
372
373 if (err)
374 goto out;
375
376 }
377
378 if (ida[IFLA_TXQLEN - 1]) {
379 if (ida[IFLA_TXQLEN - 1]->rta_len != RTA_LENGTH(sizeof(u32)))
380 goto out;
381
382 dev->tx_queue_len = *((u32 *) RTA_DATA(ida[IFLA_TXQLEN - 1]));
383 }
384
385 if (ida[IFLA_WEIGHT - 1]) {
386 if (ida[IFLA_WEIGHT - 1]->rta_len != RTA_LENGTH(sizeof(u32)))
387 goto out;
388
389 dev->weight = *((u32 *) RTA_DATA(ida[IFLA_WEIGHT - 1]));
390 }
391
392 if (ifm->ifi_index >= 0 && ida[IFLA_IFNAME - 1]) {
393 char ifname[IFNAMSIZ];
394
395 if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1],
396 IFNAMSIZ) >= IFNAMSIZ)
397 goto out;
398 err = dev_change_name(dev, ifname);
399 if (err)
400 goto out;
401 }
402
403 err = 0;
404
405out:
406 if (send_addr_notify)
407 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
408
409 dev_put(dev);
410 return err;
411}
412
413static int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
414{
415 int idx;
416 int s_idx = cb->family;
417
418 if (s_idx == 0)
419 s_idx = 1;
420 for (idx=1; idx<NPROTO; idx++) {
421 int type = cb->nlh->nlmsg_type-RTM_BASE;
422 if (idx < s_idx || idx == PF_PACKET)
423 continue;
424 if (rtnetlink_links[idx] == NULL ||
425 rtnetlink_links[idx][type].dumpit == NULL)
426 continue;
427 if (idx > s_idx)
428 memset(&cb->args[0], 0, sizeof(cb->args));
429 if (rtnetlink_links[idx][type].dumpit(skb, cb))
430 break;
431 }
432 cb->family = idx;
433
434 return skb->len;
435}
436
437void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
438{
439 struct sk_buff *skb;
440 int size = NLMSG_SPACE(sizeof(struct ifinfomsg) +
441 sizeof(struct rtnl_link_ifmap) +
442 sizeof(struct rtnl_link_stats) + 128);
443
444 skb = alloc_skb(size, GFP_KERNEL);
445 if (!skb)
446 return;
447
448 if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change) < 0) {
449 kfree_skb(skb);
450 return;
451 }
452 NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
453 netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL);
454}
455
456static int rtnetlink_done(struct netlink_callback *cb)
457{
458 return 0;
459}
460
 461/* Protected by RTNL semaphore. */
462static struct rtattr **rta_buf;
463static int rtattr_max;
464
465/* Process one rtnetlink message. */
466
467static __inline__ int
468rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
469{
470 struct rtnetlink_link *link;
471 struct rtnetlink_link *link_tab;
472 int sz_idx, kind;
473 int min_len;
474 int family;
475 int type;
476 int err;
477
478 /* Only requests are handled by kernel now */
479 if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
480 return 0;
481
482 type = nlh->nlmsg_type;
483
 484	/* Control messages: ignore them */
485 if (type < RTM_BASE)
486 return 0;
487
488 /* Unknown message: reply with EINVAL */
489 if (type > RTM_MAX)
490 goto err_inval;
491
492 type -= RTM_BASE;
493
494 /* All the messages must have at least 1 byte length */
495 if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg)))
496 return 0;
497
498 family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
499 if (family >= NPROTO) {
500 *errp = -EAFNOSUPPORT;
501 return -1;
502 }
503
504 link_tab = rtnetlink_links[family];
505 if (link_tab == NULL)
506 link_tab = rtnetlink_links[PF_UNSPEC];
507 link = &link_tab[type];
508
509 sz_idx = type>>2;
510 kind = type&3;
511
512 if (kind != 2 && security_netlink_recv(skb)) {
513 *errp = -EPERM;
514 return -1;
515 }
516
517 if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
518 u32 rlen;
519
520 if (link->dumpit == NULL)
521 link = &(rtnetlink_links[PF_UNSPEC][type]);
522
523 if (link->dumpit == NULL)
524 goto err_inval;
525
526 if ((*errp = netlink_dump_start(rtnl, skb, nlh,
527 link->dumpit,
528 rtnetlink_done)) != 0) {
529 return -1;
530 }
531 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
532 if (rlen > skb->len)
533 rlen = skb->len;
534 skb_pull(skb, rlen);
535 return -1;
536 }
537
538 memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *)));
539
540 min_len = rtm_min[sz_idx];
541 if (nlh->nlmsg_len < min_len)
542 goto err_inval;
543
544 if (nlh->nlmsg_len > min_len) {
545 int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
546 struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len);
547
548 while (RTA_OK(attr, attrlen)) {
549 unsigned flavor = attr->rta_type;
550 if (flavor) {
551 if (flavor > rta_max[sz_idx])
552 goto err_inval;
553 rta_buf[flavor-1] = attr;
554 }
555 attr = RTA_NEXT(attr, attrlen);
556 }
557 }
558
559 if (link->doit == NULL)
560 link = &(rtnetlink_links[PF_UNSPEC][type]);
561 if (link->doit == NULL)
562 goto err_inval;
563 err = link->doit(skb, nlh, (void *)&rta_buf[0]);
564
565 *errp = err;
566 return err;
567
568err_inval:
569 *errp = -EINVAL;
570 return -1;
571}
572
573/*
574 * Process one packet of messages.
575 * Malformed skbs with wrong lengths of messages are discarded silently.
576 */
577
578static inline int rtnetlink_rcv_skb(struct sk_buff *skb)
579{
580 int err;
581 struct nlmsghdr * nlh;
582
583 while (skb->len >= NLMSG_SPACE(0)) {
584 u32 rlen;
585
586 nlh = (struct nlmsghdr *)skb->data;
587 if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
588 return 0;
589 rlen = NLMSG_ALIGN(nlh->nlmsg_len);
590 if (rlen > skb->len)
591 rlen = skb->len;
592 if (rtnetlink_rcv_msg(skb, nlh, &err)) {
 593		/* Not an error, but we must interrupt processing here:
 594		 * note that in this case we do not pull the message
 595		 * from the skb; it will be processed later.
596 */
597 if (err == 0)
598 return -1;
599 netlink_ack(skb, nlh, err);
600 } else if (nlh->nlmsg_flags&NLM_F_ACK)
601 netlink_ack(skb, nlh, 0);
602 skb_pull(skb, rlen);
603 }
604
605 return 0;
606}
607
608/*
609 * rtnetlink input queue processing routine:
 610 * - try to acquire the shared lock. If that fails, defer processing.
 611 * - feed skbs to rtnetlink_rcv_skb() until it refuses a message,
 612 * which happens when a dump has started and/or acquisition of
 613 * the exclusive lock failed.
614 */
615
616static void rtnetlink_rcv(struct sock *sk, int len)
617{
618 do {
619 struct sk_buff *skb;
620
621 if (rtnl_shlock_nowait())
622 return;
623
624 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
625 if (rtnetlink_rcv_skb(skb)) {
626 if (skb->len)
627 skb_queue_head(&sk->sk_receive_queue,
628 skb);
629 else
630 kfree_skb(skb);
631 break;
632 }
633 kfree_skb(skb);
634 }
635
636 up(&rtnl_sem);
637
638 netdev_run_todo();
639 } while (rtnl && rtnl->sk_receive_queue.qlen);
640}
641
642static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] =
643{
644 [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo },
645 [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink },
646 [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
647 [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
648 [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add },
649 [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete },
650 [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info }
651};
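/*
 * A minimal userspace sketch of the RTM_GETLINK dump request that reaches
 * rtnetlink_dump_ifinfo() through the dispatch table above.  Error
 * handling and the recv() loop that parses the RTM_NEWLINK replies are
 * elided; this is an illustration, not a complete program.
 */
#if 0
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtgenmsg g;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg));
	req.nlh.nlmsg_type = RTM_GETLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.g.rtgen_family = AF_UNSPEC;

	send(fd, &req, req.nlh.nlmsg_len, 0);
	/* ... recv() and walk the returned nlmsghdrs here ... */
	close(fd);
	return 0;
}
#endif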
652
653static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
654{
655 struct net_device *dev = ptr;
656 switch (event) {
657 case NETDEV_UNREGISTER:
658 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
659 break;
660 case NETDEV_REGISTER:
661 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
662 break;
663 case NETDEV_UP:
664 case NETDEV_DOWN:
665 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
666 break;
667 case NETDEV_CHANGE:
668 case NETDEV_GOING_DOWN:
669 break;
670 default:
671 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
672 break;
673 }
674 return NOTIFY_DONE;
675}
676
677static struct notifier_block rtnetlink_dev_notifier = {
678 .notifier_call = rtnetlink_event,
679};
680
681void __init rtnetlink_init(void)
682{
683 int i;
684
685 rtattr_max = 0;
686 for (i = 0; i < ARRAY_SIZE(rta_max); i++)
687 if (rta_max[i] > rtattr_max)
688 rtattr_max = rta_max[i];
689 rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL);
690 if (!rta_buf)
691 panic("rtnetlink_init: cannot allocate rta_buf\n");
692
693 rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv);
694 if (rtnl == NULL)
695 panic("rtnetlink_init: cannot initialize rtnetlink\n");
696 netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
697 register_netdevice_notifier(&rtnetlink_dev_notifier);
698 rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table;
699 rtnetlink_links[PF_PACKET] = link_rtnetlink_table;
700}
701
702EXPORT_SYMBOL(__rta_fill);
703EXPORT_SYMBOL(rtattr_strlcpy);
704EXPORT_SYMBOL(rtattr_parse);
705EXPORT_SYMBOL(rtnetlink_links);
706EXPORT_SYMBOL(rtnetlink_put_metrics);
707EXPORT_SYMBOL(rtnl);
708EXPORT_SYMBOL(rtnl_lock);
709EXPORT_SYMBOL(rtnl_lock_interruptible);
710EXPORT_SYMBOL(rtnl_sem);
711EXPORT_SYMBOL(rtnl_unlock);
diff --git a/net/core/scm.c b/net/core/scm.c
new file mode 100644
index 000000000000..a2ebf30f6aa8
--- /dev/null
+++ b/net/core/scm.c
@@ -0,0 +1,291 @@
1/* scm.c - Socket level control messages processing.
2 *
3 * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
4 * Alignment and value checking mods by Craig Metz
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/signal.h>
14#include <linux/errno.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/kernel.h>
18#include <linux/major.h>
19#include <linux/stat.h>
20#include <linux/socket.h>
21#include <linux/file.h>
22#include <linux/fcntl.h>
23#include <linux/net.h>
24#include <linux/interrupt.h>
25#include <linux/netdevice.h>
26#include <linux/security.h>
27
28#include <asm/system.h>
29#include <asm/uaccess.h>
30
31#include <net/protocol.h>
32#include <linux/skbuff.h>
33#include <net/sock.h>
34#include <net/compat.h>
35#include <net/scm.h>
36
37
38/*
 39 * Only allow a user to send credentials that they could have set
 40 * with setu(g)id.
41 */
42
43static __inline__ int scm_check_creds(struct ucred *creds)
44{
45 if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) &&
46 ((creds->uid == current->uid || creds->uid == current->euid ||
47 creds->uid == current->suid) || capable(CAP_SETUID)) &&
48 ((creds->gid == current->gid || creds->gid == current->egid ||
49 creds->gid == current->sgid) || capable(CAP_SETGID))) {
50 return 0;
51 }
52 return -EPERM;
53}
54
55static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
56{
57 int *fdp = (int*)CMSG_DATA(cmsg);
58 struct scm_fp_list *fpl = *fplp;
59 struct file **fpp;
60 int i, num;
61
62 num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
63
64 if (num <= 0)
65 return 0;
66
67 if (num > SCM_MAX_FD)
68 return -EINVAL;
69
70 if (!fpl)
71 {
72 fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
73 if (!fpl)
74 return -ENOMEM;
75 *fplp = fpl;
76 fpl->count = 0;
77 }
78 fpp = &fpl->fp[fpl->count];
79
80 if (fpl->count + num > SCM_MAX_FD)
81 return -EINVAL;
82
83 /*
84 * Verify the descriptors and increment the usage count.
85 */
86
87 for (i=0; i< num; i++)
88 {
89 int fd = fdp[i];
90 struct file *file;
91
92 if (fd < 0 || !(file = fget(fd)))
93 return -EBADF;
94 *fpp++ = file;
95 fpl->count++;
96 }
97 return num;
98}
99
100void __scm_destroy(struct scm_cookie *scm)
101{
102 struct scm_fp_list *fpl = scm->fp;
103 int i;
104
105 if (fpl) {
106 scm->fp = NULL;
107 for (i=fpl->count-1; i>=0; i--)
108 fput(fpl->fp[i]);
109 kfree(fpl);
110 }
111}
112
113int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
114{
115 struct cmsghdr *cmsg;
116 int err;
117
118 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
119 {
120 err = -EINVAL;
121
122 /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
 123		/* The first check was omitted in <= 2.2.5. The reasoning was
 124		   that the parser checks cmsg_len in any case, so the
 125		   additional check would be duplicated work.
 126		   But if cmsg_level is not SOL_SOCKET, we do not check
 127		   for a too-short ancillary data object at all! Oops.
128 OK, let's add it...
129 */
130 if (!CMSG_OK(msg, cmsg))
131 goto error;
132
133 if (cmsg->cmsg_level != SOL_SOCKET)
134 continue;
135
136 switch (cmsg->cmsg_type)
137 {
138 case SCM_RIGHTS:
139 err=scm_fp_copy(cmsg, &p->fp);
140 if (err<0)
141 goto error;
142 break;
143 case SCM_CREDENTIALS:
144 if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
145 goto error;
146 memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
147 err = scm_check_creds(&p->creds);
148 if (err)
149 goto error;
150 break;
151 default:
152 goto error;
153 }
154 }
155
156 if (p->fp && !p->fp->count)
157 {
158 kfree(p->fp);
159 p->fp = NULL;
160 }
161 return 0;
162
163error:
164 scm_destroy(p);
165 return err;
166}
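/*
 * A minimal userspace sketch of the SCM_RIGHTS control message that
 * __scm_send() above parses: one file descriptor passed over a connected
 * AF_UNIX socket.  The function name and the sock/fd arguments are
 * hypothetical; error handling is elided.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int example_send_fd(int sock, int fd_to_pass)
{
	char dummy = '*';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	char control[CMSG_SPACE(sizeof(int))];
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0);
}
#endif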
167
168int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
169{
170 struct cmsghdr __user *cm = (struct cmsghdr __user *)msg->msg_control;
171 struct cmsghdr cmhdr;
172 int cmlen = CMSG_LEN(len);
173 int err;
174
175 if (MSG_CMSG_COMPAT & msg->msg_flags)
176 return put_cmsg_compat(msg, level, type, len, data);
177
178 if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
179 msg->msg_flags |= MSG_CTRUNC;
180 return 0; /* XXX: return error? check spec. */
181 }
182 if (msg->msg_controllen < cmlen) {
183 msg->msg_flags |= MSG_CTRUNC;
184 cmlen = msg->msg_controllen;
185 }
186 cmhdr.cmsg_level = level;
187 cmhdr.cmsg_type = type;
188 cmhdr.cmsg_len = cmlen;
189
190 err = -EFAULT;
191 if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
192 goto out;
193 if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
194 goto out;
195 cmlen = CMSG_SPACE(len);
196 msg->msg_control += cmlen;
197 msg->msg_controllen -= cmlen;
198 err = 0;
199out:
200 return err;
201}
202
203void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
204{
205 struct cmsghdr __user *cm = (struct cmsghdr __user*)msg->msg_control;
206
207 int fdmax = 0;
208 int fdnum = scm->fp->count;
209 struct file **fp = scm->fp->fp;
210 int __user *cmfptr;
211 int err = 0, i;
212
213 if (MSG_CMSG_COMPAT & msg->msg_flags) {
214 scm_detach_fds_compat(msg, scm);
215 return;
216 }
217
218 if (msg->msg_controllen > sizeof(struct cmsghdr))
219 fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
220 / sizeof(int));
221
222 if (fdnum < fdmax)
223 fdmax = fdnum;
224
225 for (i=0, cmfptr=(int __user *)CMSG_DATA(cm); i<fdmax; i++, cmfptr++)
226 {
227 int new_fd;
228 err = security_file_receive(fp[i]);
229 if (err)
230 break;
231 err = get_unused_fd();
232 if (err < 0)
233 break;
234 new_fd = err;
235 err = put_user(new_fd, cmfptr);
236 if (err) {
237 put_unused_fd(new_fd);
238 break;
239 }
240 /* Bump the usage count and install the file. */
241 get_file(fp[i]);
242 fd_install(new_fd, fp[i]);
243 }
244
245 if (i > 0)
246 {
247 int cmlen = CMSG_LEN(i*sizeof(int));
248 if (!err)
249 err = put_user(SOL_SOCKET, &cm->cmsg_level);
250 if (!err)
251 err = put_user(SCM_RIGHTS, &cm->cmsg_type);
252 if (!err)
253 err = put_user(cmlen, &cm->cmsg_len);
254 if (!err) {
255 cmlen = CMSG_SPACE(i*sizeof(int));
256 msg->msg_control += cmlen;
257 msg->msg_controllen -= cmlen;
258 }
259 }
260 if (i < fdnum || (fdnum && fdmax <= 0))
261 msg->msg_flags |= MSG_CTRUNC;
262
263 /*
264 * All of the files that fit in the message have had their
265 * usage counts incremented, so we just free the list.
266 */
267 __scm_destroy(scm);
268}
269
270struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
271{
272 struct scm_fp_list *new_fpl;
273 int i;
274
275 if (!fpl)
276 return NULL;
277
278 new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
279 if (new_fpl) {
280 for (i=fpl->count-1; i>=0; i--)
281 get_file(fpl->fp[i]);
282 memcpy(new_fpl, fpl, sizeof(*fpl));
283 }
284 return new_fpl;
285}
286
287EXPORT_SYMBOL(__scm_destroy);
288EXPORT_SYMBOL(__scm_send);
289EXPORT_SYMBOL(put_cmsg);
290EXPORT_SYMBOL(scm_detach_fds);
291EXPORT_SYMBOL(scm_fp_dup);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
new file mode 100644
index 000000000000..bf02ca9f80ac
--- /dev/null
+++ b/net/core/skbuff.c
@@ -0,0 +1,1460 @@
1/*
2 * Routines having to do with the 'struct sk_buff' memory handlers.
3 *
4 * Authors: Alan Cox <iiitac@pyr.swan.ac.uk>
5 * Florian La Roche <rzsfl@rz.uni-sb.de>
6 *
7 * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $
8 *
9 * Fixes:
10 * Alan Cox : Fixed the worst of the load
11 * balancer bugs.
12 * Dave Platt : Interrupt stacking fix.
13 * Richard Kooijman : Timestamp fixes.
14 * Alan Cox : Changed buffer format.
15 * Alan Cox : destructor hook for AF_UNIX etc.
16 * Linus Torvalds : Better skb_clone.
17 * Alan Cox : Added skb_copy.
18 * Alan Cox : Added all the changed routines Linus
19 * only put in the headers
20 * Ray VanTassle : Fixed --skb->lock in free
21 * Alan Cox : skb_copy copy arp field
22 * Andi Kleen : slabified it.
23 * Robert Olsson : Removed skb_head_pool
24 *
25 * NOTE:
26 * The __skb_ routines should be called with interrupts
27 * disabled, or you better be *real* sure that the operation is atomic
28 * with respect to whatever list is being frobbed (e.g. via lock_sock()
29 * or via disabling bottom half handlers, etc).
30 *
31 * This program is free software; you can redistribute it and/or
32 * modify it under the terms of the GNU General Public License
33 * as published by the Free Software Foundation; either version
34 * 2 of the License, or (at your option) any later version.
35 */
36
37/*
38 * The functions in this file will not compile correctly with gcc 2.4.x
39 */
40
41#include <linux/config.h>
42#include <linux/module.h>
43#include <linux/types.h>
44#include <linux/kernel.h>
45#include <linux/sched.h>
46#include <linux/mm.h>
47#include <linux/interrupt.h>
48#include <linux/in.h>
49#include <linux/inet.h>
50#include <linux/slab.h>
51#include <linux/netdevice.h>
52#ifdef CONFIG_NET_CLS_ACT
53#include <net/pkt_sched.h>
54#endif
55#include <linux/string.h>
56#include <linux/skbuff.h>
57#include <linux/cache.h>
58#include <linux/rtnetlink.h>
59#include <linux/init.h>
60#include <linux/highmem.h>
61
62#include <net/protocol.h>
63#include <net/dst.h>
64#include <net/sock.h>
65#include <net/checksum.h>
66#include <net/xfrm.h>
67
68#include <asm/uaccess.h>
69#include <asm/system.h>
70
71static kmem_cache_t *skbuff_head_cache;
72
73/*
74 * Keep out-of-line to prevent kernel bloat.
75 * __builtin_return_address is not used because it is not always
76 * reliable.
77 */
78
79/**
80 * skb_over_panic - private function
81 * @skb: buffer
82 * @sz: size
83 * @here: address
84 *
85 * Out of line support code for skb_put(). Not user callable.
86 */
87void skb_over_panic(struct sk_buff *skb, int sz, void *here)
88{
89 printk(KERN_INFO "skput:over: %p:%d put:%d dev:%s",
90 here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>");
91 BUG();
92}
93
94/**
95 * skb_under_panic - private function
96 * @skb: buffer
97 * @sz: size
98 * @here: address
99 *
100 * Out of line support code for skb_push(). Not user callable.
101 */
102
103void skb_under_panic(struct sk_buff *skb, int sz, void *here)
104{
105 printk(KERN_INFO "skput:under: %p:%d put:%d dev:%s",
106 here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>");
107 BUG();
108}
109
110/* Allocate a new skbuff. We do this ourselves so we can fill in a few
111 * 'private' fields and also do memory statistics to find all the
112 * [BEEP] leaks.
113 *
114 */
115
116/**
117 * alloc_skb - allocate a network buffer
118 * @size: size to allocate
119 * @gfp_mask: allocation mask
120 *
121 * Allocate a new &sk_buff. The returned buffer has no headroom and a
122 * tail room of size bytes. The object has a reference count of one.
123 * The return is the buffer. On a failure the return is %NULL.
124 *
125 * Buffers may only be allocated from interrupts using a @gfp_mask of
126 * %GFP_ATOMIC.
127 */
128struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
129{
130 struct sk_buff *skb;
131 u8 *data;
132
133 /* Get the HEAD */
134 skb = kmem_cache_alloc(skbuff_head_cache,
135 gfp_mask & ~__GFP_DMA);
136 if (!skb)
137 goto out;
138
139 /* Get the DATA. Size must match skb_add_mtu(). */
140 size = SKB_DATA_ALIGN(size);
141 data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
142 if (!data)
143 goto nodata;
144
145 memset(skb, 0, offsetof(struct sk_buff, truesize));
146 skb->truesize = size + sizeof(struct sk_buff);
147 atomic_set(&skb->users, 1);
148 skb->head = data;
149 skb->data = data;
150 skb->tail = data;
151 skb->end = data + size;
152
153 atomic_set(&(skb_shinfo(skb)->dataref), 1);
154 skb_shinfo(skb)->nr_frags = 0;
155 skb_shinfo(skb)->tso_size = 0;
156 skb_shinfo(skb)->tso_segs = 0;
157 skb_shinfo(skb)->frag_list = NULL;
158out:
159 return skb;
160nodata:
161 kmem_cache_free(skbuff_head_cache, skb);
162 skb = NULL;
163 goto out;
164}
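/*
 * A minimal sketch of the usual alloc_skb()/skb_reserve()/skb_put()
 * pattern a receive path might follow.  The function name and the
 * 16-byte headroom are hypothetical; only the NULL check is shown as
 * error handling.
 */
static struct sk_buff *example_build_skb(const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(len + 16, GFP_ATOMIC);
	if (!skb)
		return NULL;

	skb_reserve(skb, 16);				/* leave headroom */
	memcpy(skb_put(skb, len), payload, len);	/* advances skb->tail */
	return skb;
}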
165
166/**
167 * alloc_skb_from_cache - allocate a network buffer
168 * @cp: kmem_cache from which to allocate the data area
169 * (object size must be big enough for @size bytes + skb overheads)
170 * @size: size to allocate
171 * @gfp_mask: allocation mask
172 *
173 * Allocate a new &sk_buff. The returned buffer has no headroom and
174 * tail room of size bytes. The object has a reference count of one.
175 * The return is the buffer. On a failure the return is %NULL.
176 *
177 * Buffers may only be allocated from interrupts using a @gfp_mask of
178 * %GFP_ATOMIC.
179 */
180struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
181 unsigned int size, int gfp_mask)
182{
183 struct sk_buff *skb;
184 u8 *data;
185
186 /* Get the HEAD */
187 skb = kmem_cache_alloc(skbuff_head_cache,
188 gfp_mask & ~__GFP_DMA);
189 if (!skb)
190 goto out;
191
192 /* Get the DATA. */
193 size = SKB_DATA_ALIGN(size);
194 data = kmem_cache_alloc(cp, gfp_mask);
195 if (!data)
196 goto nodata;
197
198 memset(skb, 0, offsetof(struct sk_buff, truesize));
199 skb->truesize = size + sizeof(struct sk_buff);
200 atomic_set(&skb->users, 1);
201 skb->head = data;
202 skb->data = data;
203 skb->tail = data;
204 skb->end = data + size;
205
206 atomic_set(&(skb_shinfo(skb)->dataref), 1);
207 skb_shinfo(skb)->nr_frags = 0;
208 skb_shinfo(skb)->tso_size = 0;
209 skb_shinfo(skb)->tso_segs = 0;
210 skb_shinfo(skb)->frag_list = NULL;
211out:
212 return skb;
213nodata:
214 kmem_cache_free(skbuff_head_cache, skb);
215 skb = NULL;
216 goto out;
217}
218
219
220static void skb_drop_fraglist(struct sk_buff *skb)
221{
222 struct sk_buff *list = skb_shinfo(skb)->frag_list;
223
224 skb_shinfo(skb)->frag_list = NULL;
225
226 do {
227 struct sk_buff *this = list;
228 list = list->next;
229 kfree_skb(this);
230 } while (list);
231}
232
233static void skb_clone_fraglist(struct sk_buff *skb)
234{
235 struct sk_buff *list;
236
237 for (list = skb_shinfo(skb)->frag_list; list; list = list->next)
238 skb_get(list);
239}
240
241void skb_release_data(struct sk_buff *skb)
242{
243 if (!skb->cloned ||
244 !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
245 &skb_shinfo(skb)->dataref)) {
246 if (skb_shinfo(skb)->nr_frags) {
247 int i;
248 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
249 put_page(skb_shinfo(skb)->frags[i].page);
250 }
251
252 if (skb_shinfo(skb)->frag_list)
253 skb_drop_fraglist(skb);
254
255 kfree(skb->head);
256 }
257}
258
259/*
260 * Free an skbuff by memory without cleaning the state.
261 */
262void kfree_skbmem(struct sk_buff *skb)
263{
264 skb_release_data(skb);
265 kmem_cache_free(skbuff_head_cache, skb);
266}
267
268/**
269 * __kfree_skb - private function
270 * @skb: buffer
271 *
272 * Free an sk_buff. Release anything attached to the buffer.
273 * Clean the state. This is an internal helper function. Users should
274 * always call kfree_skb
275 */
276
277void __kfree_skb(struct sk_buff *skb)
278{
279 if (skb->list) {
280 printk(KERN_WARNING "Warning: kfree_skb passed an skb still "
281 "on a list (from %p).\n", NET_CALLER(skb));
282 BUG();
283 }
284
285 dst_release(skb->dst);
286#ifdef CONFIG_XFRM
287 secpath_put(skb->sp);
288#endif
289 if(skb->destructor) {
290 if (in_irq())
291 printk(KERN_WARNING "Warning: kfree_skb on "
292 "hard IRQ %p\n", NET_CALLER(skb));
293 skb->destructor(skb);
294 }
295#ifdef CONFIG_NETFILTER
296 nf_conntrack_put(skb->nfct);
297#ifdef CONFIG_BRIDGE_NETFILTER
298 nf_bridge_put(skb->nf_bridge);
299#endif
300#endif
301/* XXX: IS this still necessary? - JHS */
302#ifdef CONFIG_NET_SCHED
303 skb->tc_index = 0;
304#ifdef CONFIG_NET_CLS_ACT
305 skb->tc_verd = 0;
306 skb->tc_classid = 0;
307#endif
308#endif
309
310 kfree_skbmem(skb);
311}
312
313/**
314 * skb_clone - duplicate an sk_buff
315 * @skb: buffer to clone
316 * @gfp_mask: allocation priority
317 *
318 * Duplicate an &sk_buff. The new one is not owned by a socket. Both
319 * copies share the same packet data but not structure. The new
320 * buffer has a reference count of 1. If the allocation fails the
321 * function returns %NULL otherwise the new buffer is returned.
322 *
 323 * If this function is called from an interrupt, @gfp_mask must be
324 * %GFP_ATOMIC.
325 */
326
327struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
328{
329 struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
330
331 if (!n)
332 return NULL;
333
334#define C(x) n->x = skb->x
335
336 n->next = n->prev = NULL;
337 n->list = NULL;
338 n->sk = NULL;
339 C(stamp);
340 C(dev);
341 C(real_dev);
342 C(h);
343 C(nh);
344 C(mac);
345 C(dst);
346 dst_clone(skb->dst);
347 C(sp);
348#ifdef CONFIG_INET
349 secpath_get(skb->sp);
350#endif
351 memcpy(n->cb, skb->cb, sizeof(skb->cb));
352 C(len);
353 C(data_len);
354 C(csum);
355 C(local_df);
356 n->cloned = 1;
357 n->nohdr = 0;
358 C(pkt_type);
359 C(ip_summed);
360 C(priority);
361 C(protocol);
362 C(security);
363 n->destructor = NULL;
364#ifdef CONFIG_NETFILTER
365 C(nfmark);
366 C(nfcache);
367 C(nfct);
368 nf_conntrack_get(skb->nfct);
369 C(nfctinfo);
370#ifdef CONFIG_NETFILTER_DEBUG
371 C(nf_debug);
372#endif
373#ifdef CONFIG_BRIDGE_NETFILTER
374 C(nf_bridge);
375 nf_bridge_get(skb->nf_bridge);
376#endif
377#endif /*CONFIG_NETFILTER*/
378#if defined(CONFIG_HIPPI)
379 C(private);
380#endif
381#ifdef CONFIG_NET_SCHED
382 C(tc_index);
383#ifdef CONFIG_NET_CLS_ACT
384 n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
385 n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd);
386 n->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
387 C(input_dev);
388 C(tc_classid);
389#endif
390
391#endif
392 C(truesize);
393 atomic_set(&n->users, 1);
394 C(head);
395 C(data);
396 C(tail);
397 C(end);
398
399 atomic_inc(&(skb_shinfo(skb)->dataref));
400 skb->cloned = 1;
401
402 return n;
403}
404
405static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
406{
407 /*
408 * Shift between the two data areas in bytes
409 */
410 unsigned long offset = new->data - old->data;
411
412 new->list = NULL;
413 new->sk = NULL;
414 new->dev = old->dev;
415 new->real_dev = old->real_dev;
416 new->priority = old->priority;
417 new->protocol = old->protocol;
418 new->dst = dst_clone(old->dst);
419#ifdef CONFIG_INET
420 new->sp = secpath_get(old->sp);
421#endif
422 new->h.raw = old->h.raw + offset;
423 new->nh.raw = old->nh.raw + offset;
424 new->mac.raw = old->mac.raw + offset;
425 memcpy(new->cb, old->cb, sizeof(old->cb));
426 new->local_df = old->local_df;
427 new->pkt_type = old->pkt_type;
428 new->stamp = old->stamp;
429 new->destructor = NULL;
430 new->security = old->security;
431#ifdef CONFIG_NETFILTER
432 new->nfmark = old->nfmark;
433 new->nfcache = old->nfcache;
434 new->nfct = old->nfct;
435 nf_conntrack_get(old->nfct);
436 new->nfctinfo = old->nfctinfo;
437#ifdef CONFIG_NETFILTER_DEBUG
438 new->nf_debug = old->nf_debug;
439#endif
440#ifdef CONFIG_BRIDGE_NETFILTER
441 new->nf_bridge = old->nf_bridge;
442 nf_bridge_get(old->nf_bridge);
443#endif
444#endif
445#ifdef CONFIG_NET_SCHED
446#ifdef CONFIG_NET_CLS_ACT
447 new->tc_verd = old->tc_verd;
448#endif
449 new->tc_index = old->tc_index;
450#endif
451 atomic_set(&new->users, 1);
452 skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
453 skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
454}
455
456/**
457 * skb_copy - create private copy of an sk_buff
458 * @skb: buffer to copy
459 * @gfp_mask: allocation priority
460 *
461 * Make a copy of both an &sk_buff and its data. This is used when the
462 * caller wishes to modify the data and needs a private copy of the
463 * data to alter. Returns %NULL on failure or the pointer to the buffer
464 * on success. The returned buffer has a reference count of 1.
465 *
 466 * As a by-product this function converts a non-linear &sk_buff to a
 467 * linear one, so the &sk_buff becomes completely private and the caller
 468 * is allowed to modify all the data in the returned buffer. This means
 469 * the function is not recommended when only the header is going to be
 470 * modified. Use pskb_copy() instead.
471 */
472
473struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
474{
475 int headerlen = skb->data - skb->head;
476 /*
477 * Allocate the copy buffer
478 */
479 struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len,
480 gfp_mask);
481 if (!n)
482 return NULL;
483
484 /* Set the data pointer */
485 skb_reserve(n, headerlen);
486 /* Set the tail pointer and length */
487 skb_put(n, skb->len);
488 n->csum = skb->csum;
489 n->ip_summed = skb->ip_summed;
490
491 if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
492 BUG();
493
494 copy_skb_header(n, skb);
495 return n;
496}
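/*
 * A minimal sketch of the clone-versus-copy choice discussed above:
 * skb_clone() shares the packet data and is enough when the data is only
 * read, while a writer needs its own copy via skb_copy() (or pskb_copy()
 * if only the header will change).  The function name and flag are
 * hypothetical.
 */
static struct sk_buff *example_get_ref(struct sk_buff *skb, int will_write)
{
	if (!will_write)
		return skb_clone(skb, GFP_ATOMIC);	/* shared data */

	return skb_copy(skb, GFP_ATOMIC);		/* private, linear copy */
}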
497
498
499/**
500 * pskb_copy - create copy of an sk_buff with private head.
501 * @skb: buffer to copy
502 * @gfp_mask: allocation priority
503 *
 504 * Make a copy of both an &sk_buff and part of its data, located
 505 * in the header. Fragmented data remain shared. This is used when
 506 * the caller wishes to modify only the header of an &sk_buff and needs
 507 * a private copy of the header to alter. Returns %NULL on failure
508 * or the pointer to the buffer on success.
509 * The returned buffer has a reference count of 1.
510 */
511
512struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask)
513{
514 /*
515 * Allocate the copy buffer
516 */
517 struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask);
518
519 if (!n)
520 goto out;
521
522 /* Set the data pointer */
523 skb_reserve(n, skb->data - skb->head);
524 /* Set the tail pointer and length */
525 skb_put(n, skb_headlen(skb));
526 /* Copy the bytes */
527 memcpy(n->data, skb->data, n->len);
528 n->csum = skb->csum;
529 n->ip_summed = skb->ip_summed;
530
531 n->data_len = skb->data_len;
532 n->len = skb->len;
533
534 if (skb_shinfo(skb)->nr_frags) {
535 int i;
536
537 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
538 skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
539 get_page(skb_shinfo(n)->frags[i].page);
540 }
541 skb_shinfo(n)->nr_frags = i;
542 }
543
544 if (skb_shinfo(skb)->frag_list) {
545 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
546 skb_clone_fraglist(n);
547 }
548
549 copy_skb_header(n, skb);
550out:
551 return n;
552}
553
554/**
555 * pskb_expand_head - reallocate header of &sk_buff
556 * @skb: buffer to reallocate
557 * @nhead: room to add at head
558 * @ntail: room to add at tail
559 * @gfp_mask: allocation priority
560 *
 561 * Expands (or creates an identical copy, if @nhead and @ntail are zero)
 562 * the header of the skb. The &sk_buff itself is not changed and MUST
 563 * have a reference count of 1. Returns zero on success, or a negative
 564 * error code if expansion failed; in that case the &sk_buff is unchanged.
565 *
566 * All the pointers pointing into skb header may change and must be
567 * reloaded after call to this function.
568 */
569
570int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask)
571{
572 int i;
573 u8 *data;
574 int size = nhead + (skb->end - skb->head) + ntail;
575 long off;
576
577 if (skb_shared(skb))
578 BUG();
579
580 size = SKB_DATA_ALIGN(size);
581
582 data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
583 if (!data)
584 goto nodata;
585
586 /* Copy only real data... and, alas, header. This should be
587 * optimized for the cases when header is void. */
588 memcpy(data + nhead, skb->head, skb->tail - skb->head);
589 memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
590
591 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
592 get_page(skb_shinfo(skb)->frags[i].page);
593
594 if (skb_shinfo(skb)->frag_list)
595 skb_clone_fraglist(skb);
596
597 skb_release_data(skb);
598
599 off = (data + nhead) - skb->head;
600
601 skb->head = data;
602 skb->end = data + size;
603 skb->data += off;
604 skb->tail += off;
605 skb->mac.raw += off;
606 skb->h.raw += off;
607 skb->nh.raw += off;
608 skb->cloned = 0;
609 skb->nohdr = 0;
610 atomic_set(&skb_shinfo(skb)->dataref, 1);
611 return 0;
612
613nodata:
614 return -ENOMEM;
615}
616
617/* Make private copy of skb with writable head and some headroom */
618
619struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
620{
621 struct sk_buff *skb2;
622 int delta = headroom - skb_headroom(skb);
623
624 if (delta <= 0)
625 skb2 = pskb_copy(skb, GFP_ATOMIC);
626 else {
627 skb2 = skb_clone(skb, GFP_ATOMIC);
628 if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
629 GFP_ATOMIC)) {
630 kfree_skb(skb2);
631 skb2 = NULL;
632 }
633 }
634 return skb2;
635}
636
637
638/**
639 * skb_copy_expand - copy and expand sk_buff
640 * @skb: buffer to copy
641 * @newheadroom: new free bytes at head
642 * @newtailroom: new free bytes at tail
643 * @gfp_mask: allocation priority
644 *
645 * Make a copy of both an &sk_buff and its data and while doing so
646 * allocate additional space.
647 *
648 * This is used when the caller wishes to modify the data and needs a
649 * private copy of the data to alter as well as more space for new fields.
650 * Returns %NULL on failure or the pointer to the buffer
651 * on success. The returned buffer has a reference count of 1.
652 *
653 * You must pass %GFP_ATOMIC as the allocation priority if this function
654 * is called from an interrupt.
655 *
656 * BUG ALERT: ip_summed is not copied. Why does this work? Is it used
657 * only by netfilter in the cases when checksum is recalculated? --ANK
658 */
659struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
660 int newheadroom, int newtailroom, int gfp_mask)
661{
662 /*
663 * Allocate the copy buffer
664 */
665 struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
666 gfp_mask);
667 int head_copy_len, head_copy_off;
668
669 if (!n)
670 return NULL;
671
672 skb_reserve(n, newheadroom);
673
674 /* Set the tail pointer and length */
675 skb_put(n, skb->len);
676
677 head_copy_len = skb_headroom(skb);
678 head_copy_off = 0;
679 if (newheadroom <= head_copy_len)
680 head_copy_len = newheadroom;
681 else
682 head_copy_off = newheadroom - head_copy_len;
683
684 /* Copy the linear header and data. */
685 if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
686 skb->len + head_copy_len))
687 BUG();
688
689 copy_skb_header(n, skb);
690
691 return n;
692}
693
694/**
695 * skb_pad - zero pad the tail of an skb
696 * @skb: buffer to pad
697 * @pad: space to pad
698 *
699 * Ensure that a buffer is followed by a padding area that is zero
700 * filled. Used by network drivers which may DMA or transfer data
701 * beyond the buffer end onto the wire.
702 *
703 * May return NULL in out of memory cases.
704 */
705
706struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
707{
708 struct sk_buff *nskb;
709
710 /* If the skbuff is non linear tailroom is always zero.. */
711 if (skb_tailroom(skb) >= pad) {
712 memset(skb->data+skb->len, 0, pad);
713 return skb;
714 }
715
716 nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC);
717 kfree_skb(skb);
718 if (nskb)
719 memset(nskb->data+nskb->len, 0, pad);
720 return nskb;
721}
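/*
 * A minimal sketch of the typical driver use of skb_pad(): pad short
 * Ethernet frames up to the 60-byte minimum before handing them to
 * hardware.  ETH_ZLEN comes from <linux/if_ether.h>; the function name
 * is hypothetical.
 */
static struct sk_buff *example_pad_to_min(struct sk_buff *skb)
{
	unsigned int pad;

	if (skb->len >= ETH_ZLEN)
		return skb;

	pad = ETH_ZLEN - skb->len;
	skb = skb_pad(skb, pad);	/* frees the old skb if it had to copy */
	if (skb)
		skb_put(skb, pad);	/* account the zeroed padding in len */
	return skb;
}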
722
 723/* Trims the skb to length len. It can change skb pointers if "realloc" is 1.
 724 * If realloc==0 and trimming is impossible without changing the data,
 725 * we BUG().
726 */
727
728int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
729{
730 int offset = skb_headlen(skb);
731 int nfrags = skb_shinfo(skb)->nr_frags;
732 int i;
733
734 for (i = 0; i < nfrags; i++) {
735 int end = offset + skb_shinfo(skb)->frags[i].size;
736 if (end > len) {
737 if (skb_cloned(skb)) {
738 if (!realloc)
739 BUG();
740 if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
741 return -ENOMEM;
742 }
743 if (len <= offset) {
744 put_page(skb_shinfo(skb)->frags[i].page);
745 skb_shinfo(skb)->nr_frags--;
746 } else {
747 skb_shinfo(skb)->frags[i].size = len - offset;
748 }
749 }
750 offset = end;
751 }
752
753 if (offset < len) {
754 skb->data_len -= skb->len - len;
755 skb->len = len;
756 } else {
757 if (len <= skb_headlen(skb)) {
758 skb->len = len;
759 skb->data_len = 0;
760 skb->tail = skb->data + len;
761 if (skb_shinfo(skb)->frag_list && !skb_cloned(skb))
762 skb_drop_fraglist(skb);
763 } else {
764 skb->data_len -= skb->len - len;
765 skb->len = len;
766 }
767 }
768
769 return 0;
770}
771
772/**
773 * __pskb_pull_tail - advance tail of skb header
774 * @skb: buffer to reallocate
775 * @delta: number of bytes to advance tail
776 *
 777 * The function makes sense only on a fragmented &sk_buff;
 778 * it expands the header, moving its tail forward and copying the
 779 * necessary data from the fragmented part.
780 *
781 * &sk_buff MUST have reference count of 1.
782 *
783 * Returns %NULL (and &sk_buff does not change) if pull failed
784 * or value of new tail of skb in the case of success.
785 *
786 * All the pointers pointing into skb header may change and must be
787 * reloaded after call to this function.
788 */
789
790/* Moves tail of skb head forward, copying data from fragmented part,
791 * when it is necessary.
792 * 1. It may fail due to malloc failure.
793 * 2. It may change skb pointers.
794 *
795 * It is pretty complicated. Luckily, it is called only in exceptional cases.
796 */
797unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
798{
 799	/* If the skb does not have enough free space at the tail, get a new
 800	 * one with an extra 128 bytes for future expansion. If we have enough
 801	 * room at the tail, reallocate without expansion only if the skb is cloned.
802 */
803 int i, k, eat = (skb->tail + delta) - skb->end;
804
805 if (eat > 0 || skb_cloned(skb)) {
806 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
807 GFP_ATOMIC))
808 return NULL;
809 }
810
811 if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta))
812 BUG();
813
814 /* Optimization: no fragments, no reasons to preestimate
815 * size of pulled pages. Superb.
816 */
817 if (!skb_shinfo(skb)->frag_list)
818 goto pull_pages;
819
820 /* Estimate size of pulled pages. */
821 eat = delta;
822 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
823 if (skb_shinfo(skb)->frags[i].size >= eat)
824 goto pull_pages;
825 eat -= skb_shinfo(skb)->frags[i].size;
826 }
827
 828	/* If we need to update the frag list, we are in trouble.
 829	 * Certainly, it is possible to add an offset to the skb data,
 830	 * but taking into account that pulling is expected to be a very
 831	 * rare operation, it is worth fighting against further bloating
 832	 * of the skb head and crucifying ourselves here instead.
 833	 * Pure masochism, indeed. 8)8)
834 */
835 if (eat) {
836 struct sk_buff *list = skb_shinfo(skb)->frag_list;
837 struct sk_buff *clone = NULL;
838 struct sk_buff *insp = NULL;
839
840 do {
841 if (!list)
842 BUG();
843
844 if (list->len <= eat) {
845 /* Eaten as whole. */
846 eat -= list->len;
847 list = list->next;
848 insp = list;
849 } else {
850 /* Eaten partially. */
851
852 if (skb_shared(list)) {
853 /* Sucks! We need to fork list. :-( */
854 clone = skb_clone(list, GFP_ATOMIC);
855 if (!clone)
856 return NULL;
857 insp = list->next;
858 list = clone;
859 } else {
860 /* This may be pulled without
861 * problems. */
862 insp = list;
863 }
864 if (!pskb_pull(list, eat)) {
865 if (clone)
866 kfree_skb(clone);
867 return NULL;
868 }
869 break;
870 }
871 } while (eat);
872
873 /* Free pulled out fragments. */
874 while ((list = skb_shinfo(skb)->frag_list) != insp) {
875 skb_shinfo(skb)->frag_list = list->next;
876 kfree_skb(list);
877 }
878 /* And insert new clone at head. */
879 if (clone) {
880 clone->next = list;
881 skb_shinfo(skb)->frag_list = clone;
882 }
883 }
884 /* Success! Now we may commit changes to skb data. */
885
886pull_pages:
887 eat = delta;
888 k = 0;
889 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
890 if (skb_shinfo(skb)->frags[i].size <= eat) {
891 put_page(skb_shinfo(skb)->frags[i].page);
892 eat -= skb_shinfo(skb)->frags[i].size;
893 } else {
894 skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
895 if (eat) {
896 skb_shinfo(skb)->frags[k].page_offset += eat;
897 skb_shinfo(skb)->frags[k].size -= eat;
898 eat = 0;
899 }
900 k++;
901 }
902 }
903 skb_shinfo(skb)->nr_frags = k;
904
905 skb->tail += delta;
906 skb->data_len -= delta;
907
908 return skb->tail;
909}
910
911/* Copy some data bits from skb to kernel buffer. */
912
913int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
914{
915 int i, copy;
916 int start = skb_headlen(skb);
917
918 if (offset > (int)skb->len - len)
919 goto fault;
920
921 /* Copy header. */
922 if ((copy = start - offset) > 0) {
923 if (copy > len)
924 copy = len;
925 memcpy(to, skb->data + offset, copy);
926 if ((len -= copy) == 0)
927 return 0;
928 offset += copy;
929 to += copy;
930 }
931
932 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
933 int end;
934
935 BUG_TRAP(start <= offset + len);
936
937 end = start + skb_shinfo(skb)->frags[i].size;
938 if ((copy = end - offset) > 0) {
939 u8 *vaddr;
940
941 if (copy > len)
942 copy = len;
943
944 vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
945 memcpy(to,
946 vaddr + skb_shinfo(skb)->frags[i].page_offset+
947 offset - start, copy);
948 kunmap_skb_frag(vaddr);
949
950 if ((len -= copy) == 0)
951 return 0;
952 offset += copy;
953 to += copy;
954 }
955 start = end;
956 }
957
958 if (skb_shinfo(skb)->frag_list) {
959 struct sk_buff *list = skb_shinfo(skb)->frag_list;
960
961 for (; list; list = list->next) {
962 int end;
963
964 BUG_TRAP(start <= offset + len);
965
966 end = start + list->len;
967 if ((copy = end - offset) > 0) {
968 if (copy > len)
969 copy = len;
970 if (skb_copy_bits(list, offset - start,
971 to, copy))
972 goto fault;
973 if ((len -= copy) == 0)
974 return 0;
975 offset += copy;
976 to += copy;
977 }
978 start = end;
979 }
980 }
981 if (!len)
982 return 0;
983
984fault:
985 return -EFAULT;
986}
987
988/* Checksum skb data. */
989
990unsigned int skb_checksum(const struct sk_buff *skb, int offset,
991 int len, unsigned int csum)
992{
993 int start = skb_headlen(skb);
994 int i, copy = start - offset;
995 int pos = 0;
996
997 /* Checksum header. */
998 if (copy > 0) {
999 if (copy > len)
1000 copy = len;
1001 csum = csum_partial(skb->data + offset, copy, csum);
1002 if ((len -= copy) == 0)
1003 return csum;
1004 offset += copy;
1005 pos = copy;
1006 }
1007
1008 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1009 int end;
1010
1011 BUG_TRAP(start <= offset + len);
1012
1013 end = start + skb_shinfo(skb)->frags[i].size;
1014 if ((copy = end - offset) > 0) {
1015 unsigned int csum2;
1016 u8 *vaddr;
1017 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1018
1019 if (copy > len)
1020 copy = len;
1021 vaddr = kmap_skb_frag(frag);
1022 csum2 = csum_partial(vaddr + frag->page_offset +
1023 offset - start, copy, 0);
1024 kunmap_skb_frag(vaddr);
1025 csum = csum_block_add(csum, csum2, pos);
1026 if (!(len -= copy))
1027 return csum;
1028 offset += copy;
1029 pos += copy;
1030 }
1031 start = end;
1032 }
1033
1034 if (skb_shinfo(skb)->frag_list) {
1035 struct sk_buff *list = skb_shinfo(skb)->frag_list;
1036
1037 for (; list; list = list->next) {
1038 int end;
1039
1040 BUG_TRAP(start <= offset + len);
1041
1042 end = start + list->len;
1043 if ((copy = end - offset) > 0) {
1044 unsigned int csum2;
1045 if (copy > len)
1046 copy = len;
1047 csum2 = skb_checksum(list, offset - start,
1048 copy, 0);
1049 csum = csum_block_add(csum, csum2, pos);
1050 if ((len -= copy) == 0)
1051 return csum;
1052 offset += copy;
1053 pos += copy;
1054 }
1055 start = end;
1056 }
1057 }
1058 if (len)
1059 BUG();
1060
1061 return csum;
1062}
1063
1064/* Both of above in one bottle. */
1065
1066unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
1067 u8 *to, int len, unsigned int csum)
1068{
1069 int start = skb_headlen(skb);
1070 int i, copy = start - offset;
1071 int pos = 0;
1072
1073 /* Copy header. */
1074 if (copy > 0) {
1075 if (copy > len)
1076 copy = len;
1077 csum = csum_partial_copy_nocheck(skb->data + offset, to,
1078 copy, csum);
1079 if ((len -= copy) == 0)
1080 return csum;
1081 offset += copy;
1082 to += copy;
1083 pos = copy;
1084 }
1085
1086 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1087 int end;
1088
1089 BUG_TRAP(start <= offset + len);
1090
1091 end = start + skb_shinfo(skb)->frags[i].size;
1092 if ((copy = end - offset) > 0) {
1093 unsigned int csum2;
1094 u8 *vaddr;
1095 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1096
1097 if (copy > len)
1098 copy = len;
1099 vaddr = kmap_skb_frag(frag);
1100 csum2 = csum_partial_copy_nocheck(vaddr +
1101 frag->page_offset +
1102 offset - start, to,
1103 copy, 0);
1104 kunmap_skb_frag(vaddr);
1105 csum = csum_block_add(csum, csum2, pos);
1106 if (!(len -= copy))
1107 return csum;
1108 offset += copy;
1109 to += copy;
1110 pos += copy;
1111 }
1112 start = end;
1113 }
1114
1115 if (skb_shinfo(skb)->frag_list) {
1116 struct sk_buff *list = skb_shinfo(skb)->frag_list;
1117
1118 for (; list; list = list->next) {
1119 unsigned int csum2;
1120 int end;
1121
1122 BUG_TRAP(start <= offset + len);
1123
1124 end = start + list->len;
1125 if ((copy = end - offset) > 0) {
1126 if (copy > len)
1127 copy = len;
1128 csum2 = skb_copy_and_csum_bits(list,
1129 offset - start,
1130 to, copy, 0);
1131 csum = csum_block_add(csum, csum2, pos);
1132 if ((len -= copy) == 0)
1133 return csum;
1134 offset += copy;
1135 to += copy;
1136 pos += copy;
1137 }
1138 start = end;
1139 }
1140 }
1141 if (len)
1142 BUG();
1143 return csum;
1144}
1145
1146void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
1147{
1148 unsigned int csum;
1149 long csstart;
1150
1151 if (skb->ip_summed == CHECKSUM_HW)
1152 csstart = skb->h.raw - skb->data;
1153 else
1154 csstart = skb_headlen(skb);
1155
1156 if (csstart > skb_headlen(skb))
1157 BUG();
1158
1159 memcpy(to, skb->data, csstart);
1160
1161 csum = 0;
1162 if (csstart != skb->len)
1163 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
1164 skb->len - csstart, 0);
1165
1166 if (skb->ip_summed == CHECKSUM_HW) {
1167 long csstuff = csstart + skb->csum;
1168
1169 *((unsigned short *)(to + csstuff)) = csum_fold(csum);
1170 }
1171}
1172
1173/**
1174 * skb_dequeue - remove from the head of the queue
1175 * @list: list to dequeue from
1176 *
1177 * Remove the head of the list. The list lock is taken so the function
1178 * may be used safely with other locking list functions. The head item is
1179 * returned or %NULL if the list is empty.
1180 */
1181
1182struct sk_buff *skb_dequeue(struct sk_buff_head *list)
1183{
1184 unsigned long flags;
1185 struct sk_buff *result;
1186
1187 spin_lock_irqsave(&list->lock, flags);
1188 result = __skb_dequeue(list);
1189 spin_unlock_irqrestore(&list->lock, flags);
1190 return result;
1191}
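
/*
 * A minimal sketch of the intended producer/consumer pattern, assuming a
 * hypothetical local queue.  skb_queue_tail() and skb_dequeue() take the
 * list lock internally, so no extra locking is needed around them.
 */
static void queue_example(struct sk_buff *skb)
{
	struct sk_buff_head q;
	struct sk_buff *pending;

	skb_queue_head_init(&q);		/* normally done once at init time */

	skb_queue_tail(&q, skb);		/* producer side */

	while ((pending = skb_dequeue(&q)) != NULL)	/* consumer side */
		kfree_skb(pending);
}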
1192
1193/**
1194 * skb_dequeue_tail - remove from the tail of the queue
1195 * @list: list to dequeue from
1196 *
1197 * Remove the tail of the list. The list lock is taken so the function
1198 * may be used safely with other locking list functions. The tail item is
1199 * returned or %NULL if the list is empty.
1200 */
1201struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
1202{
1203 unsigned long flags;
1204 struct sk_buff *result;
1205
1206 spin_lock_irqsave(&list->lock, flags);
1207 result = __skb_dequeue_tail(list);
1208 spin_unlock_irqrestore(&list->lock, flags);
1209 return result;
1210}
1211
1212/**
1213 * skb_queue_purge - empty a list
1214 * @list: list to empty
1215 *
1216 * Delete all buffers on an &sk_buff list. Each buffer is removed from
1217 * the list and one reference dropped. This function takes the list
1218 * lock and is atomic with respect to other list locking functions.
1219 */
1220void skb_queue_purge(struct sk_buff_head *list)
1221{
1222 struct sk_buff *skb;
1223 while ((skb = skb_dequeue(list)) != NULL)
1224 kfree_skb(skb);
1225}
1226
1227/**
1228 * skb_queue_head - queue a buffer at the list head
1229 * @list: list to use
1230 * @newsk: buffer to queue
1231 *
1232 * Queue a buffer at the start of the list. This function takes the
1233 * list lock and can be used safely with other locking &sk_buff
1234 * functions.
1235 *
1236 * A buffer cannot be placed on two lists at the same time.
1237 */
1238void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
1239{
1240 unsigned long flags;
1241
1242 spin_lock_irqsave(&list->lock, flags);
1243 __skb_queue_head(list, newsk);
1244 spin_unlock_irqrestore(&list->lock, flags);
1245}
1246
1247/**
1248 * skb_queue_tail - queue a buffer at the list tail
1249 * @list: list to use
1250 * @newsk: buffer to queue
1251 *
1252 * Queue a buffer at the tail of the list. This function takes the
1253 * list lock and can be used safely with other locking &sk_buff
1254 * functions.
1255 *
1256 * A buffer cannot be placed on two lists at the same time.
1257 */
1258void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
1259{
1260 unsigned long flags;
1261
1262 spin_lock_irqsave(&list->lock, flags);
1263 __skb_queue_tail(list, newsk);
1264 spin_unlock_irqrestore(&list->lock, flags);
1265}
1266/**
1267 * skb_unlink - remove a buffer from a list
1268 * @skb: buffer to remove
1269 *
1270 * Remove a packet from a list. The list locks are taken and this
1271 * function is atomic with respect to other list locked calls.
1272 *
1273 * Works even without knowing the list it is sitting on, which can be
1274 * handy at times. It also means that THE LIST MUST EXIST when you
1275 * unlink. Thus a list must have its contents unlinked before it is
1276 * destroyed.
1277 */
1278void skb_unlink(struct sk_buff *skb)
1279{
1280 struct sk_buff_head *list = skb->list;
1281
1282 if (list) {
1283 unsigned long flags;
1284
1285 spin_lock_irqsave(&list->lock, flags);
1286 if (skb->list == list)
1287 __skb_unlink(skb, skb->list);
1288 spin_unlock_irqrestore(&list->lock, flags);
1289 }
1290}
1291
1292
1293/**
1294 * skb_append - append a buffer
1295 * @old: buffer to insert after
1296 * @newsk: buffer to insert
1297 *
1298 * Place a packet after a given packet in a list. The list locks are taken
1299 * and this function is atomic with respect to other list locked calls.
1300 * A buffer cannot be placed on two lists at the same time.
1301 */
1302
1303void skb_append(struct sk_buff *old, struct sk_buff *newsk)
1304{
1305 unsigned long flags;
1306
1307 spin_lock_irqsave(&old->list->lock, flags);
1308 __skb_append(old, newsk);
1309 spin_unlock_irqrestore(&old->list->lock, flags);
1310}
1311
1312
1313/**
1314 * skb_insert - insert a buffer
1315 * @old: buffer to insert before
1316 * @newsk: buffer to insert
1317 *
1318 * Place a packet before a given packet in a list. The list locks are taken
1319 * and this function is atomic with respect to other list locked calls.
1320 * A buffer cannot be placed on two lists at the same time.
1321 */
1322
1323void skb_insert(struct sk_buff *old, struct sk_buff *newsk)
1324{
1325 unsigned long flags;
1326
1327 spin_lock_irqsave(&old->list->lock, flags);
1328 __skb_insert(newsk, old->prev, old, old->list);
1329 spin_unlock_irqrestore(&old->list->lock, flags);
1330}
1331
1332#if 0
1333/*
1334 * Tune the memory allocator for a new MTU size.
1335 */
1336void skb_add_mtu(int mtu)
1337{
1338 /* Must match allocation in alloc_skb */
1339 mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info);
1340
1341 kmem_add_cache_size(mtu);
1342}
1343#endif
1344
1345static inline void skb_split_inside_header(struct sk_buff *skb,
1346 struct sk_buff* skb1,
1347 const u32 len, const int pos)
1348{
1349 int i;
1350
1351 memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len);
1352
1353 /* And move data appendix as is. */
1354 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1355 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
1356
1357 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
1358 skb_shinfo(skb)->nr_frags = 0;
1359 skb1->data_len = skb->data_len;
1360 skb1->len += skb1->data_len;
1361 skb->data_len = 0;
1362 skb->len = len;
1363 skb->tail = skb->data + len;
1364}
1365
1366static inline void skb_split_no_header(struct sk_buff *skb,
1367 struct sk_buff* skb1,
1368 const u32 len, int pos)
1369{
1370 int i, k = 0;
1371 const int nfrags = skb_shinfo(skb)->nr_frags;
1372
1373 skb_shinfo(skb)->nr_frags = 0;
1374 skb1->len = skb1->data_len = skb->len - len;
1375 skb->len = len;
1376 skb->data_len = len - pos;
1377
1378 for (i = 0; i < nfrags; i++) {
1379 int size = skb_shinfo(skb)->frags[i].size;
1380
1381 if (pos + size > len) {
1382 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
1383
1384 if (pos < len) {
1385 /* Split frag.
1386 * We have two variants in this case:
1387 * 1. Move the whole frag to the second
1388 * part, if it is possible. E.g. this
1389 * approach is mandatory for TUX,
1390 * where splitting is expensive.
1391 * 2. Split the frag accurately; that is what we do here.
1392 */
1393 get_page(skb_shinfo(skb)->frags[i].page);
1394 skb_shinfo(skb1)->frags[0].page_offset += len - pos;
1395 skb_shinfo(skb1)->frags[0].size -= len - pos;
1396 skb_shinfo(skb)->frags[i].size = len - pos;
1397 skb_shinfo(skb)->nr_frags++;
1398 }
1399 k++;
1400 } else
1401 skb_shinfo(skb)->nr_frags++;
1402 pos += size;
1403 }
1404 skb_shinfo(skb1)->nr_frags = k;
1405}
1406
1407/**
1408 * skb_split - Split fragmented skb to two parts at length len.
1409 * @skb: the buffer to split
1410 * @skb1: the buffer to receive the second part
1411 * @len: new length for skb
1412 */
1413void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
1414{
1415 int pos = skb_headlen(skb);
1416
1417 if (len < pos) /* Split line is inside header. */
1418 skb_split_inside_header(skb, skb1, len, pos);
1419 else /* Second chunk has no header, nothing to copy. */
1420 skb_split_no_header(skb, skb1, len, pos);
1421}
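
/*
 * A minimal sketch of splitting a buffer in two, using hypothetical
 * names.  The second buffer must be freshly allocated; when the split
 * point falls inside the linear header, the tail of the header is
 * copied into it with skb_put(), so give it enough tailroom for the
 * worst case (the full headlen).
 */
static struct sk_buff *split_example(struct sk_buff *skb, u32 len)
{
	struct sk_buff *skb1 = alloc_skb(skb_headlen(skb), GFP_ATOMIC);

	if (!skb1)
		return NULL;

	/* skb keeps the first "len" bytes, skb1 receives the rest. */
	skb_split(skb, skb1, len);
	return skb1;
}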
1422
1423void __init skb_init(void)
1424{
1425 skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
1426 sizeof(struct sk_buff),
1427 0,
1428 SLAB_HWCACHE_ALIGN,
1429 NULL, NULL);
1430 if (!skbuff_head_cache)
1431 panic("cannot create skbuff cache");
1432}
1433
1434EXPORT_SYMBOL(___pskb_trim);
1435EXPORT_SYMBOL(__kfree_skb);
1436EXPORT_SYMBOL(__pskb_pull_tail);
1437EXPORT_SYMBOL(alloc_skb);
1438EXPORT_SYMBOL(pskb_copy);
1439EXPORT_SYMBOL(pskb_expand_head);
1440EXPORT_SYMBOL(skb_checksum);
1441EXPORT_SYMBOL(skb_clone);
1442EXPORT_SYMBOL(skb_clone_fraglist);
1443EXPORT_SYMBOL(skb_copy);
1444EXPORT_SYMBOL(skb_copy_and_csum_bits);
1445EXPORT_SYMBOL(skb_copy_and_csum_dev);
1446EXPORT_SYMBOL(skb_copy_bits);
1447EXPORT_SYMBOL(skb_copy_expand);
1448EXPORT_SYMBOL(skb_over_panic);
1449EXPORT_SYMBOL(skb_pad);
1450EXPORT_SYMBOL(skb_realloc_headroom);
1451EXPORT_SYMBOL(skb_under_panic);
1452EXPORT_SYMBOL(skb_dequeue);
1453EXPORT_SYMBOL(skb_dequeue_tail);
1454EXPORT_SYMBOL(skb_insert);
1455EXPORT_SYMBOL(skb_queue_purge);
1456EXPORT_SYMBOL(skb_queue_head);
1457EXPORT_SYMBOL(skb_queue_tail);
1458EXPORT_SYMBOL(skb_unlink);
1459EXPORT_SYMBOL(skb_append);
1460EXPORT_SYMBOL(skb_split);
diff --git a/net/core/sock.c b/net/core/sock.c
new file mode 100644
index 000000000000..629ab4a5b45b
--- /dev/null
+++ b/net/core/sock.c
@@ -0,0 +1,1565 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
10 * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11 *
12 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
13 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Alan Cox, <A.Cox@swansea.ac.uk>
16 *
17 * Fixes:
18 * Alan Cox : Numerous verify_area() problems
19 * Alan Cox : Connecting on a connecting socket
20 * now returns an error for tcp.
21 * Alan Cox : sock->protocol is set correctly.
22 * and is not sometimes left as 0.
23 * Alan Cox : connect handles icmp errors on a
24 * connect properly. Unfortunately there
25 * is a restart syscall nasty there. I
26 * can't match BSD without hacking the C
27 * library. Ideas urgently sought!
28 * Alan Cox : Disallow bind() to addresses that are
29 * not ours - especially broadcast ones!!
30 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
31 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
32 * instead they leave that for the DESTROY timer.
33 * Alan Cox : Clean up error flag in accept
34 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
35 * was buggy. Put a remove_sock() in the handler
36 * for memory when we hit 0. Also altered the timer
37 * code. The ACK stuff can wait and needs major
38 * TCP layer surgery.
39 * Alan Cox : Fixed TCP ack bug, removed remove sock
40 * and fixed timer/inet_bh race.
41 * Alan Cox : Added zapped flag for TCP
42 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
43 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
45 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
46 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47 * Rick Sladkey : Relaxed UDP rules for matching packets.
48 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
49 * Pauline Middelink : identd support
50 * Alan Cox : Fixed connect() taking signals I think.
51 * Alan Cox : SO_LINGER supported
52 * Alan Cox : Error reporting fixes
53 * Anonymous : inet_create tidied up (sk->reuse setting)
54 * Alan Cox : inet sockets don't set sk->type!
55 * Alan Cox : Split socket option code
56 * Alan Cox : Callbacks
57 * Alan Cox : Nagle flag for Charles & Johannes stuff
58 * Alex : Removed restriction on inet fioctl
59 * Alan Cox : Splitting INET from NET core
60 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
61 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
62 * Alan Cox : Split IP from generic code
63 * Alan Cox : New kfree_skbmem()
64 * Alan Cox : Make SO_DEBUG superuser only.
65 * Alan Cox : Allow anyone to clear SO_DEBUG
66 * (compatibility fix)
67 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
68 * Alan Cox : Allocator for a socket is settable.
69 * Alan Cox : SO_ERROR includes soft errors.
70 * Alan Cox : Allow NULL arguments on some SO_ opts
71 * Alan Cox : Generic socket allocation to make hooks
72 * easier (suggested by Craig Metz).
73 * Michael Pall : SO_ERROR returns positive errno again
74 * Steve Whitehouse: Added default destructor to free
75 * protocol private data.
76 * Steve Whitehouse: Added various other default routines
77 * common to several socket families.
78 * Chris Evans : Call suser() check last on F_SETOWN
79 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
81 * Andi Kleen : Fix write_space callback
82 * Chris Evans : Security fixes - signedness again
83 * Arnaldo C. Melo : cleanups, use skb_queue_purge
84 *
85 * To Fix:
86 *
87 *
88 * This program is free software; you can redistribute it and/or
89 * modify it under the terms of the GNU General Public License
90 * as published by the Free Software Foundation; either version
91 * 2 of the License, or (at your option) any later version.
92 */
93
94#include <linux/config.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/major.h>
101#include <linux/module.h>
102#include <linux/proc_fs.h>
103#include <linux/seq_file.h>
104#include <linux/sched.h>
105#include <linux/timer.h>
106#include <linux/string.h>
107#include <linux/sockios.h>
108#include <linux/net.h>
109#include <linux/mm.h>
110#include <linux/slab.h>
111#include <linux/interrupt.h>
112#include <linux/poll.h>
113#include <linux/tcp.h>
114#include <linux/init.h>
115
116#include <asm/uaccess.h>
117#include <asm/system.h>
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
122#include <net/sock.h>
123#include <net/xfrm.h>
124#include <linux/ipsec.h>
125
126#include <linux/filter.h>
127
128#ifdef CONFIG_INET
129#include <net/tcp.h>
130#endif
131
132/* Take into consideration the size of the struct sk_buff overhead in the
133 * determination of these values, since that is non-constant across
134 * platforms. This makes socket queueing behavior and performance
135 * not depend upon such differences.
136 */
137#define _SK_MEM_PACKETS 256
138#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
139#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
140#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
141
142/* Run time adjustable parameters. */
143__u32 sysctl_wmem_max = SK_WMEM_MAX;
144__u32 sysctl_rmem_max = SK_RMEM_MAX;
145__u32 sysctl_wmem_default = SK_WMEM_MAX;
146__u32 sysctl_rmem_default = SK_RMEM_MAX;
147
148 /* Maximal space eaten by iovec or ancillary data plus some space */
149int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
150
151static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
152{
153 struct timeval tv;
154
155 if (optlen < sizeof(tv))
156 return -EINVAL;
157 if (copy_from_user(&tv, optval, sizeof(tv)))
158 return -EFAULT;
159
160 *timeo_p = MAX_SCHEDULE_TIMEOUT;
161 if (tv.tv_sec == 0 && tv.tv_usec == 0)
162 return 0;
163 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
164 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
165 return 0;
166}
167
168static void sock_warn_obsolete_bsdism(const char *name)
169{
170 static int warned;
171 static char warncomm[TASK_COMM_LEN];
172 if (strcmp(warncomm, current->comm) && warned < 5) {
173 strcpy(warncomm, current->comm);
174 printk(KERN_WARNING "process `%s' is using obsolete "
175 "%s SO_BSDCOMPAT\n", warncomm, name);
176 warned++;
177 }
178}
179
180static void sock_disable_timestamp(struct sock *sk)
181{
182 if (sock_flag(sk, SOCK_TIMESTAMP)) {
183 sock_reset_flag(sk, SOCK_TIMESTAMP);
184 net_disable_timestamp();
185 }
186}
187
188
189/*
190 * This is meant for all protocols to use and covers goings on
191 * at the socket level. Everything here is generic.
192 */
193
194int sock_setsockopt(struct socket *sock, int level, int optname,
195 char __user *optval, int optlen)
196{
197 struct sock *sk=sock->sk;
198 struct sk_filter *filter;
199 int val;
200 int valbool;
201 struct linger ling;
202 int ret = 0;
203
204 /*
205 * Options without arguments
206 */
207
208#ifdef SO_DONTLINGER /* Compatibility item... */
209 switch (optname) {
210 case SO_DONTLINGER:
211 sock_reset_flag(sk, SOCK_LINGER);
212 return 0;
213 }
214#endif
215
216 if(optlen<sizeof(int))
217 return(-EINVAL);
218
219 if (get_user(val, (int __user *)optval))
220 return -EFAULT;
221
222 valbool = val?1:0;
223
224 lock_sock(sk);
225
226 switch(optname)
227 {
228 case SO_DEBUG:
229 if(val && !capable(CAP_NET_ADMIN))
230 {
231 ret = -EACCES;
232 }
233 else if (valbool)
234 sock_set_flag(sk, SOCK_DBG);
235 else
236 sock_reset_flag(sk, SOCK_DBG);
237 break;
238 case SO_REUSEADDR:
239 sk->sk_reuse = valbool;
240 break;
241 case SO_TYPE:
242 case SO_ERROR:
243 ret = -ENOPROTOOPT;
244 break;
245 case SO_DONTROUTE:
246 if (valbool)
247 sock_set_flag(sk, SOCK_LOCALROUTE);
248 else
249 sock_reset_flag(sk, SOCK_LOCALROUTE);
250 break;
251 case SO_BROADCAST:
252 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
253 break;
254 case SO_SNDBUF:
255 /* Don't return an error on this; BSD doesn't, and
256 if you think about it this is right. Otherwise apps
257 have to play 'guess the biggest size' games.
258 RCVBUF/SNDBUF are treated in BSD as hints. */
259
260 if (val > sysctl_wmem_max)
261 val = sysctl_wmem_max;
262
263 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
264 if ((val * 2) < SOCK_MIN_SNDBUF)
265 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
266 else
267 sk->sk_sndbuf = val * 2;
268
269 /*
270 * Wake up sending tasks if we
271 * upped the value.
272 */
273 sk->sk_write_space(sk);
274 break;
275
276 case SO_RCVBUF:
277 /* Don't return an error on this; BSD doesn't, and
278 if you think about it this is right. Otherwise apps
279 have to play 'guess the biggest size' games.
280 RCVBUF/SNDBUF are treated in BSD as hints. */
281
282 if (val > sysctl_rmem_max)
283 val = sysctl_rmem_max;
284
285 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
286 /* FIXME: is this lower bound the right one? */
287 if ((val * 2) < SOCK_MIN_RCVBUF)
288 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
289 else
290 sk->sk_rcvbuf = val * 2;
291 break;
292
293 case SO_KEEPALIVE:
294#ifdef CONFIG_INET
295 if (sk->sk_protocol == IPPROTO_TCP)
296 tcp_set_keepalive(sk, valbool);
297#endif
298 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
299 break;
300
301 case SO_OOBINLINE:
302 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
303 break;
304
305 case SO_NO_CHECK:
306 sk->sk_no_check = valbool;
307 break;
308
309 case SO_PRIORITY:
310 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
311 sk->sk_priority = val;
312 else
313 ret = -EPERM;
314 break;
315
316 case SO_LINGER:
317 if(optlen<sizeof(ling)) {
318 ret = -EINVAL; /* 1003.1g */
319 break;
320 }
321 if (copy_from_user(&ling,optval,sizeof(ling))) {
322 ret = -EFAULT;
323 break;
324 }
325 if (!ling.l_onoff)
326 sock_reset_flag(sk, SOCK_LINGER);
327 else {
328#if (BITS_PER_LONG == 32)
329 if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
330 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
331 else
332#endif
333 sk->sk_lingertime = ling.l_linger * HZ;
334 sock_set_flag(sk, SOCK_LINGER);
335 }
336 break;
337
338 case SO_BSDCOMPAT:
339 sock_warn_obsolete_bsdism("setsockopt");
340 break;
341
342 case SO_PASSCRED:
343 if (valbool)
344 set_bit(SOCK_PASSCRED, &sock->flags);
345 else
346 clear_bit(SOCK_PASSCRED, &sock->flags);
347 break;
348
349 case SO_TIMESTAMP:
350 if (valbool) {
351 sock_set_flag(sk, SOCK_RCVTSTAMP);
352 sock_enable_timestamp(sk);
353 } else
354 sock_reset_flag(sk, SOCK_RCVTSTAMP);
355 break;
356
357 case SO_RCVLOWAT:
358 if (val < 0)
359 val = INT_MAX;
360 sk->sk_rcvlowat = val ? : 1;
361 break;
362
363 case SO_RCVTIMEO:
364 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
365 break;
366
367 case SO_SNDTIMEO:
368 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
369 break;
370
371#ifdef CONFIG_NETDEVICES
372 case SO_BINDTODEVICE:
373 {
374 char devname[IFNAMSIZ];
375
376 /* Sorry... */
377 if (!capable(CAP_NET_RAW)) {
378 ret = -EPERM;
379 break;
380 }
381
382 /* Bind this socket to a particular device like "eth0",
383 * as specified in the passed interface name. If the
384 * name is "" or the option length is zero the socket
385 * is not bound.
386 */
387
388 if (!valbool) {
389 sk->sk_bound_dev_if = 0;
390 } else {
391 if (optlen > IFNAMSIZ)
392 optlen = IFNAMSIZ;
393 if (copy_from_user(devname, optval, optlen)) {
394 ret = -EFAULT;
395 break;
396 }
397
398 /* Remove any cached route for this socket. */
399 sk_dst_reset(sk);
400
401 if (devname[0] == '\0') {
402 sk->sk_bound_dev_if = 0;
403 } else {
404 struct net_device *dev = dev_get_by_name(devname);
405 if (!dev) {
406 ret = -ENODEV;
407 break;
408 }
409 sk->sk_bound_dev_if = dev->ifindex;
410 dev_put(dev);
411 }
412 }
413 break;
414 }
415#endif
416
417
418 case SO_ATTACH_FILTER:
419 ret = -EINVAL;
420 if (optlen == sizeof(struct sock_fprog)) {
421 struct sock_fprog fprog;
422
423 ret = -EFAULT;
424 if (copy_from_user(&fprog, optval, sizeof(fprog)))
425 break;
426
427 ret = sk_attach_filter(&fprog, sk);
428 }
429 break;
430
431 case SO_DETACH_FILTER:
432 spin_lock_bh(&sk->sk_lock.slock);
433 filter = sk->sk_filter;
434 if (filter) {
435 sk->sk_filter = NULL;
436 spin_unlock_bh(&sk->sk_lock.slock);
437 sk_filter_release(sk, filter);
438 break;
439 }
440 spin_unlock_bh(&sk->sk_lock.slock);
441 ret = -ENONET;
442 break;
443
444 /* We implement SO_SNDLOWAT etc. as
445 not settable (1003.1g 5.3). */
446 default:
447 ret = -ENOPROTOOPT;
448 break;
449 }
450 release_sock(sk);
451 return ret;
452}
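
/*
 * How the options above are exercised from user space: a hedged,
 * illustrative snippet (user-space C, not kernel code) that sets a
 * 2.5 second receive timeout, which sock_set_timeout() above then
 * converts to jiffies.
 */
#if 0	/* user-space illustration only */
#include <sys/socket.h>
#include <sys/time.h>

static int set_rcv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };

	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}
#endif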
453
454
455int sock_getsockopt(struct socket *sock, int level, int optname,
456 char __user *optval, int __user *optlen)
457{
458 struct sock *sk = sock->sk;
459
460 union
461 {
462 int val;
463 struct linger ling;
464 struct timeval tm;
465 } v;
466
467 unsigned int lv = sizeof(int);
468 int len;
469
470 if(get_user(len,optlen))
471 return -EFAULT;
472 if(len < 0)
473 return -EINVAL;
474
475 switch(optname)
476 {
477 case SO_DEBUG:
478 v.val = sock_flag(sk, SOCK_DBG);
479 break;
480
481 case SO_DONTROUTE:
482 v.val = sock_flag(sk, SOCK_LOCALROUTE);
483 break;
484
485 case SO_BROADCAST:
486 v.val = !!sock_flag(sk, SOCK_BROADCAST);
487 break;
488
489 case SO_SNDBUF:
490 v.val = sk->sk_sndbuf;
491 break;
492
493 case SO_RCVBUF:
494 v.val = sk->sk_rcvbuf;
495 break;
496
497 case SO_REUSEADDR:
498 v.val = sk->sk_reuse;
499 break;
500
501 case SO_KEEPALIVE:
502 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
503 break;
504
505 case SO_TYPE:
506 v.val = sk->sk_type;
507 break;
508
509 case SO_ERROR:
510 v.val = -sock_error(sk);
511 if(v.val==0)
512 v.val = xchg(&sk->sk_err_soft, 0);
513 break;
514
515 case SO_OOBINLINE:
516 v.val = !!sock_flag(sk, SOCK_URGINLINE);
517 break;
518
519 case SO_NO_CHECK:
520 v.val = sk->sk_no_check;
521 break;
522
523 case SO_PRIORITY:
524 v.val = sk->sk_priority;
525 break;
526
527 case SO_LINGER:
528 lv = sizeof(v.ling);
529 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
530 v.ling.l_linger = sk->sk_lingertime / HZ;
531 break;
532
533 case SO_BSDCOMPAT:
534 sock_warn_obsolete_bsdism("getsockopt");
535 break;
536
537 case SO_TIMESTAMP:
538 v.val = sock_flag(sk, SOCK_RCVTSTAMP);
539 break;
540
541 case SO_RCVTIMEO:
542 lv=sizeof(struct timeval);
543 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
544 v.tm.tv_sec = 0;
545 v.tm.tv_usec = 0;
546 } else {
547 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
548 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
549 }
550 break;
551
552 case SO_SNDTIMEO:
553 lv=sizeof(struct timeval);
554 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
555 v.tm.tv_sec = 0;
556 v.tm.tv_usec = 0;
557 } else {
558 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
559 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
560 }
561 break;
562
563 case SO_RCVLOWAT:
564 v.val = sk->sk_rcvlowat;
565 break;
566
567 case SO_SNDLOWAT:
568 v.val=1;
569 break;
570
571 case SO_PASSCRED:
572 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
573 break;
574
575 case SO_PEERCRED:
576 if (len > sizeof(sk->sk_peercred))
577 len = sizeof(sk->sk_peercred);
578 if (copy_to_user(optval, &sk->sk_peercred, len))
579 return -EFAULT;
580 goto lenout;
581
582 case SO_PEERNAME:
583 {
584 char address[128];
585
586 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
587 return -ENOTCONN;
588 if (lv < len)
589 return -EINVAL;
590 if (copy_to_user(optval, address, len))
591 return -EFAULT;
592 goto lenout;
593 }
594
595 /* Dubious BSD thing... Probably nobody even uses it, but
596 * the UNIX standard wants it for whatever reason... -DaveM
597 */
598 case SO_ACCEPTCONN:
599 v.val = sk->sk_state == TCP_LISTEN;
600 break;
601
602 case SO_PEERSEC:
603 return security_socket_getpeersec(sock, optval, optlen, len);
604
605 default:
606 return(-ENOPROTOOPT);
607 }
608 if (len > lv)
609 len = lv;
610 if (copy_to_user(optval, &v, len))
611 return -EFAULT;
612lenout:
613 if (put_user(len, optlen))
614 return -EFAULT;
615 return 0;
616}
617
618/**
619 * sk_alloc - All socket objects are allocated here
620 * @family: protocol family
621 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
622 * @prot: struct proto associated with this new sock instance
623 * @zero_it: if we should zero the newly allocated sock
624 */
625struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
626{
627 struct sock *sk = NULL;
628 kmem_cache_t *slab = prot->slab;
629
630 if (slab != NULL)
631 sk = kmem_cache_alloc(slab, priority);
632 else
633 sk = kmalloc(prot->obj_size, priority);
634
635 if (sk) {
636 if (zero_it) {
637 memset(sk, 0, prot->obj_size);
638 sk->sk_family = family;
639 sk->sk_prot = prot;
640 sock_lock_init(sk);
641 }
642
643 if (security_sk_alloc(sk, family, priority)) {
644 kmem_cache_free(slab, sk);
645 sk = NULL;
646 } else
647 __module_get(prot->owner);
648 }
649 return sk;
650}
651
652void sk_free(struct sock *sk)
653{
654 struct sk_filter *filter;
655 struct module *owner = sk->sk_prot->owner;
656
657 if (sk->sk_destruct)
658 sk->sk_destruct(sk);
659
660 filter = sk->sk_filter;
661 if (filter) {
662 sk_filter_release(sk, filter);
663 sk->sk_filter = NULL;
664 }
665
666 sock_disable_timestamp(sk);
667
668 if (atomic_read(&sk->sk_omem_alloc))
669 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
670 __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
671
672 security_sk_free(sk);
673 if (sk->sk_prot->slab != NULL)
674 kmem_cache_free(sk->sk_prot->slab, sk);
675 else
676 kfree(sk);
677 module_put(owner);
678}
679
680void __init sk_init(void)
681{
682 if (num_physpages <= 4096) {
683 sysctl_wmem_max = 32767;
684 sysctl_rmem_max = 32767;
685 sysctl_wmem_default = 32767;
686 sysctl_rmem_default = 32767;
687 } else if (num_physpages >= 131072) {
688 sysctl_wmem_max = 131071;
689 sysctl_rmem_max = 131071;
690 }
691}
692
693/*
694 * Simple resource managers for sockets.
695 */
696
697
698/*
699 * Write buffer destructor automatically called from kfree_skb.
700 */
701void sock_wfree(struct sk_buff *skb)
702{
703 struct sock *sk = skb->sk;
704
705 /* In case it might be waiting for more memory. */
706 atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
707 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
708 sk->sk_write_space(sk);
709 sock_put(sk);
710}
711
712/*
713 * Read buffer destructor automatically called from kfree_skb.
714 */
715void sock_rfree(struct sk_buff *skb)
716{
717 struct sock *sk = skb->sk;
718
719 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
720}
721
722
723int sock_i_uid(struct sock *sk)
724{
725 int uid;
726
727 read_lock(&sk->sk_callback_lock);
728 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
729 read_unlock(&sk->sk_callback_lock);
730 return uid;
731}
732
733unsigned long sock_i_ino(struct sock *sk)
734{
735 unsigned long ino;
736
737 read_lock(&sk->sk_callback_lock);
738 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
739 read_unlock(&sk->sk_callback_lock);
740 return ino;
741}
742
743/*
744 * Allocate a skb from the socket's send buffer.
745 */
746struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
747{
748 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
749 struct sk_buff * skb = alloc_skb(size, priority);
750 if (skb) {
751 skb_set_owner_w(skb, sk);
752 return skb;
753 }
754 }
755 return NULL;
756}
757
758/*
759 * Allocate a skb from the socket's receive buffer.
760 */
761struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
762{
763 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
764 struct sk_buff *skb = alloc_skb(size, priority);
765 if (skb) {
766 skb_set_owner_r(skb, sk);
767 return skb;
768 }
769 }
770 return NULL;
771}
772
773/*
774 * Allocate a memory block from the socket's option memory buffer.
775 */
776void *sock_kmalloc(struct sock *sk, int size, int priority)
777{
778 if ((unsigned)size <= sysctl_optmem_max &&
779 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
780 void *mem;
781 /* First do the add, to avoid the race if kmalloc
782 * might sleep.
783 */
784 atomic_add(size, &sk->sk_omem_alloc);
785 mem = kmalloc(size, priority);
786 if (mem)
787 return mem;
788 atomic_sub(size, &sk->sk_omem_alloc);
789 }
790 return NULL;
791}
792
793/*
794 * Free an option memory block.
795 */
796void sock_kfree_s(struct sock *sk, void *mem, int size)
797{
798 kfree(mem);
799 atomic_sub(size, &sk->sk_omem_alloc);
800}
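
/*
 * A minimal sketch of the intended pairing, assuming a hypothetical
 * per-socket option blob of "size" bytes.  The allocation is charged to
 * sk->sk_omem_alloc and must be released with sock_kfree_s() passing
 * the same size, so the charge is undone exactly.
 */
static int option_blob_example(struct sock *sk, int size)
{
	void *blob = sock_kmalloc(sk, size, GFP_KERNEL);

	if (!blob)
		return -ENOBUFS;

	memset(blob, 0, size);		/* ... use the blob ... */

	sock_kfree_s(sk, blob, size);	/* uncharge exactly what was charged */
	return 0;
}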
801
802/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
803 I think these locks should be removed for datagram sockets.
804 */
805static long sock_wait_for_wmem(struct sock * sk, long timeo)
806{
807 DEFINE_WAIT(wait);
808
809 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
810 for (;;) {
811 if (!timeo)
812 break;
813 if (signal_pending(current))
814 break;
815 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
816 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
817 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
818 break;
819 if (sk->sk_shutdown & SEND_SHUTDOWN)
820 break;
821 if (sk->sk_err)
822 break;
823 timeo = schedule_timeout(timeo);
824 }
825 finish_wait(sk->sk_sleep, &wait);
826 return timeo;
827}
828
829
830/*
831 * Generic send/receive buffer handlers
832 */
833
834static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
835 unsigned long header_len,
836 unsigned long data_len,
837 int noblock, int *errcode)
838{
839 struct sk_buff *skb;
840 unsigned int gfp_mask;
841 long timeo;
842 int err;
843
844 gfp_mask = sk->sk_allocation;
845 if (gfp_mask & __GFP_WAIT)
846 gfp_mask |= __GFP_REPEAT;
847
848 timeo = sock_sndtimeo(sk, noblock);
849 while (1) {
850 err = sock_error(sk);
851 if (err != 0)
852 goto failure;
853
854 err = -EPIPE;
855 if (sk->sk_shutdown & SEND_SHUTDOWN)
856 goto failure;
857
858 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
859 skb = alloc_skb(header_len, sk->sk_allocation);
860 if (skb) {
861 int npages;
862 int i;
863
864 /* No pages, we're done... */
865 if (!data_len)
866 break;
867
868 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
869 skb->truesize += data_len;
870 skb_shinfo(skb)->nr_frags = npages;
871 for (i = 0; i < npages; i++) {
872 struct page *page;
873 skb_frag_t *frag;
874
875 page = alloc_pages(sk->sk_allocation, 0);
876 if (!page) {
877 err = -ENOBUFS;
878 skb_shinfo(skb)->nr_frags = i;
879 kfree_skb(skb);
880 goto failure;
881 }
882
883 frag = &skb_shinfo(skb)->frags[i];
884 frag->page = page;
885 frag->page_offset = 0;
886 frag->size = (data_len >= PAGE_SIZE ?
887 PAGE_SIZE :
888 data_len);
889 data_len -= PAGE_SIZE;
890 }
891
892 /* Full success... */
893 break;
894 }
895 err = -ENOBUFS;
896 goto failure;
897 }
898 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
899 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
900 err = -EAGAIN;
901 if (!timeo)
902 goto failure;
903 if (signal_pending(current))
904 goto interrupted;
905 timeo = sock_wait_for_wmem(sk, timeo);
906 }
907
908 skb_set_owner_w(skb, sk);
909 return skb;
910
911interrupted:
912 err = sock_intr_errno(timeo);
913failure:
914 *errcode = err;
915 return NULL;
916}
917
918struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
919 int noblock, int *errcode)
920{
921 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
922}
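
/*
 * A minimal sketch of a datagram send path, assuming hypothetical
 * "hlen" bytes of protocol headers and "paylen" bytes of payload.
 * sock_alloc_send_skb() waits (or not, per "noblock") until the send
 * buffer has room, and charges the skb to the socket via
 * skb_set_owner_w().
 */
static struct sk_buff *build_dgram_example(struct sock *sk, int hlen,
					   int paylen, int noblock)
{
	int err;
	struct sk_buff *skb;

	skb = sock_alloc_send_skb(sk, hlen + paylen, noblock, &err);
	if (!skb)
		return NULL;		/* err holds -EAGAIN, -EPIPE, ... */

	skb_reserve(skb, hlen);		/* leave room for headers */
	skb_put(skb, paylen);		/* payload would be copied in here */
	return skb;
}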
923
924static void __lock_sock(struct sock *sk)
925{
926 DEFINE_WAIT(wait);
927
928 for(;;) {
929 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
930 TASK_UNINTERRUPTIBLE);
931 spin_unlock_bh(&sk->sk_lock.slock);
932 schedule();
933 spin_lock_bh(&sk->sk_lock.slock);
934 if(!sock_owned_by_user(sk))
935 break;
936 }
937 finish_wait(&sk->sk_lock.wq, &wait);
938}
939
940static void __release_sock(struct sock *sk)
941{
942 struct sk_buff *skb = sk->sk_backlog.head;
943
944 do {
945 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
946 bh_unlock_sock(sk);
947
948 do {
949 struct sk_buff *next = skb->next;
950
951 skb->next = NULL;
952 sk->sk_backlog_rcv(sk, skb);
953
954 /*
955 * We are in process context here with softirqs
956 * disabled, use cond_resched_softirq() to preempt.
957 * This is safe to do because we've taken the backlog
958 * queue private:
959 */
960 cond_resched_softirq();
961
962 skb = next;
963 } while (skb != NULL);
964
965 bh_lock_sock(sk);
966 } while((skb = sk->sk_backlog.head) != NULL);
967}
968
969/**
970 * sk_wait_data - wait for data to arrive at sk_receive_queue
971 * @sk: sock to wait on
972 * @timeo: for how long
973 *
974 * Now socket state including sk->sk_err is changed only under the lock,
975 * hence we may omit checks after joining the wait queue.
976 * We check the receive queue before schedule() only as an optimization;
977 * it is very likely that release_sock() added new data.
978 */
979int sk_wait_data(struct sock *sk, long *timeo)
980{
981 int rc;
982 DEFINE_WAIT(wait);
983
984 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
985 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
986 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
987 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
988 finish_wait(sk->sk_sleep, &wait);
989 return rc;
990}
991
992EXPORT_SYMBOL(sk_wait_data);
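
/*
 * A minimal sketch of how a datagram-style recvmsg() path might use
 * sk_wait_data(), assuming the socket lock is already held by the
 * caller.  The names are illustrative.
 */
static struct sk_buff *wait_for_packet_example(struct sock *sk, int noblock)
{
	struct sk_buff *skb;
	long timeo = sock_rcvtimeo(sk, noblock);

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) == NULL) {
		if (!timeo || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
			break;
		/* Drops and re-takes the socket lock while sleeping. */
		sk_wait_data(sk, &timeo);
	}
	return skb;
}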
993
994/*
995 * Set of default routines for initialising struct proto_ops when
996 * the protocol does not support a particular function. In certain
997 * cases where it makes no sense for a protocol to have a "do nothing"
998 * function, some default processing is provided.
999 */
1000
1001int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1002{
1003 return -EOPNOTSUPP;
1004}
1005
1006int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1007 int len, int flags)
1008{
1009 return -EOPNOTSUPP;
1010}
1011
1012int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1013{
1014 return -EOPNOTSUPP;
1015}
1016
1017int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1018{
1019 return -EOPNOTSUPP;
1020}
1021
1022int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1023 int *len, int peer)
1024{
1025 return -EOPNOTSUPP;
1026}
1027
1028unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1029{
1030 return 0;
1031}
1032
1033int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1034{
1035 return -EOPNOTSUPP;
1036}
1037
1038int sock_no_listen(struct socket *sock, int backlog)
1039{
1040 return -EOPNOTSUPP;
1041}
1042
1043int sock_no_shutdown(struct socket *sock, int how)
1044{
1045 return -EOPNOTSUPP;
1046}
1047
1048int sock_no_setsockopt(struct socket *sock, int level, int optname,
1049 char __user *optval, int optlen)
1050{
1051 return -EOPNOTSUPP;
1052}
1053
1054int sock_no_getsockopt(struct socket *sock, int level, int optname,
1055 char __user *optval, int __user *optlen)
1056{
1057 return -EOPNOTSUPP;
1058}
1059
1060int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1061 size_t len)
1062{
1063 return -EOPNOTSUPP;
1064}
1065
1066int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1067 size_t len, int flags)
1068{
1069 return -EOPNOTSUPP;
1070}
1071
1072int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1073{
1074 /* Mirror missing mmap method error code */
1075 return -ENODEV;
1076}
1077
1078ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1079{
1080 ssize_t res;
1081 struct msghdr msg = {.msg_flags = flags};
1082 struct kvec iov;
1083 char *kaddr = kmap(page);
1084 iov.iov_base = kaddr + offset;
1085 iov.iov_len = size;
1086 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1087 kunmap(page);
1088 return res;
1089}
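
/*
 * A minimal sketch of how the defaults above are typically wired into a
 * protocol's proto_ops, assuming a hypothetical connectionless family
 * "PF_EXAMPLE" with its own sendmsg/recvmsg; everything it does not
 * support simply points at the sock_no_*() stubs.
 */
#if 0	/* illustrative only; PF_EXAMPLE and the example_* handlers are hypothetical */
static struct proto_ops example_dgram_ops = {
	.family		= PF_EXAMPLE,
	.owner		= THIS_MODULE,
	.release	= example_release,
	.bind		= example_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= example_getname,
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= example_sendmsg,
	.recvmsg	= example_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};
#endif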
1090
1091/*
1092 * Default Socket Callbacks
1093 */
1094
1095static void sock_def_wakeup(struct sock *sk)
1096{
1097 read_lock(&sk->sk_callback_lock);
1098 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1099 wake_up_interruptible_all(sk->sk_sleep);
1100 read_unlock(&sk->sk_callback_lock);
1101}
1102
1103static void sock_def_error_report(struct sock *sk)
1104{
1105 read_lock(&sk->sk_callback_lock);
1106 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1107 wake_up_interruptible(sk->sk_sleep);
1108 sk_wake_async(sk,0,POLL_ERR);
1109 read_unlock(&sk->sk_callback_lock);
1110}
1111
1112static void sock_def_readable(struct sock *sk, int len)
1113{
1114 read_lock(&sk->sk_callback_lock);
1115 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1116 wake_up_interruptible(sk->sk_sleep);
1117 sk_wake_async(sk,1,POLL_IN);
1118 read_unlock(&sk->sk_callback_lock);
1119}
1120
1121static void sock_def_write_space(struct sock *sk)
1122{
1123 read_lock(&sk->sk_callback_lock);
1124
1125 /* Do not wake up a writer until he can make "significant"
1126 * progress. --DaveM
1127 */
1128 if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1129 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1130 wake_up_interruptible(sk->sk_sleep);
1131
1132 /* Should agree with poll, otherwise some programs break */
1133 if (sock_writeable(sk))
1134 sk_wake_async(sk, 2, POLL_OUT);
1135 }
1136
1137 read_unlock(&sk->sk_callback_lock);
1138}
1139
1140static void sock_def_destruct(struct sock *sk)
1141{
1142 if (sk->sk_protinfo)
1143 kfree(sk->sk_protinfo);
1144}
1145
1146void sk_send_sigurg(struct sock *sk)
1147{
1148 if (sk->sk_socket && sk->sk_socket->file)
1149 if (send_sigurg(&sk->sk_socket->file->f_owner))
1150 sk_wake_async(sk, 3, POLL_PRI);
1151}
1152
1153void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1154 unsigned long expires)
1155{
1156 if (!mod_timer(timer, expires))
1157 sock_hold(sk);
1158}
1159
1160EXPORT_SYMBOL(sk_reset_timer);
1161
1162void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1163{
1164 if (timer_pending(timer) && del_timer(timer))
1165 __sock_put(sk);
1166}
1167
1168EXPORT_SYMBOL(sk_stop_timer);
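
/*
 * A minimal sketch, assuming a hypothetical keepalive-style use of
 * sk->sk_timer: the helpers above pair mod_timer()/del_timer() with the
 * socket refcount, so the callback may safely dereference the sock and
 * must drop the reference it was armed with.
 */
static void example_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* ... per-protocol timer work would go here ... */

	sock_put(sk);		/* release the ref taken by sk_reset_timer() */
}

static void arm_example_timer(struct sock *sk)
{
	sk->sk_timer.function = example_timer;
	sk->sk_timer.data = (unsigned long)sk;
	sk_reset_timer(sk, &sk->sk_timer, jiffies + 10 * HZ);
}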
1169
1170void sock_init_data(struct socket *sock, struct sock *sk)
1171{
1172 skb_queue_head_init(&sk->sk_receive_queue);
1173 skb_queue_head_init(&sk->sk_write_queue);
1174 skb_queue_head_init(&sk->sk_error_queue);
1175
1176 sk->sk_send_head = NULL;
1177
1178 init_timer(&sk->sk_timer);
1179
1180 sk->sk_allocation = GFP_KERNEL;
1181 sk->sk_rcvbuf = sysctl_rmem_default;
1182 sk->sk_sndbuf = sysctl_wmem_default;
1183 sk->sk_state = TCP_CLOSE;
1184 sk->sk_socket = sock;
1185
1186 sock_set_flag(sk, SOCK_ZAPPED);
1187
1188 if(sock)
1189 {
1190 sk->sk_type = sock->type;
1191 sk->sk_sleep = &sock->wait;
1192 sock->sk = sk;
1193 } else
1194 sk->sk_sleep = NULL;
1195
1196 rwlock_init(&sk->sk_dst_lock);
1197 rwlock_init(&sk->sk_callback_lock);
1198
1199 sk->sk_state_change = sock_def_wakeup;
1200 sk->sk_data_ready = sock_def_readable;
1201 sk->sk_write_space = sock_def_write_space;
1202 sk->sk_error_report = sock_def_error_report;
1203 sk->sk_destruct = sock_def_destruct;
1204
1205 sk->sk_sndmsg_page = NULL;
1206 sk->sk_sndmsg_off = 0;
1207
1208 sk->sk_peercred.pid = 0;
1209 sk->sk_peercred.uid = -1;
1210 sk->sk_peercred.gid = -1;
1211 sk->sk_write_pending = 0;
1212 sk->sk_rcvlowat = 1;
1213 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1214 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1215
1216 sk->sk_stamp.tv_sec = -1L;
1217 sk->sk_stamp.tv_usec = -1L;
1218
1219 atomic_set(&sk->sk_refcnt, 1);
1220}
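
/*
 * A minimal sketch of the usual create() pattern built on sk_alloc()
 * and sock_init_data(), reusing the hypothetical example_proto and
 * example_dgram_ops from the other sketches in this file.
 */
#if 0	/* illustrative only */
static int example_create(struct socket *sock, int protocol)
{
	struct sock *sk;

	sk = sk_alloc(PF_EXAMPLE, GFP_KERNEL, &example_proto, 1);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);	/* queues, callbacks, defaults */
	sock->ops = &example_dgram_ops;
	return 0;
}
#endif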
1221
1222void fastcall lock_sock(struct sock *sk)
1223{
1224 might_sleep();
1225 spin_lock_bh(&(sk->sk_lock.slock));
1226 if (sk->sk_lock.owner)
1227 __lock_sock(sk);
1228 sk->sk_lock.owner = (void *)1;
1229 spin_unlock_bh(&(sk->sk_lock.slock));
1230}
1231
1232EXPORT_SYMBOL(lock_sock);
1233
1234void fastcall release_sock(struct sock *sk)
1235{
1236 spin_lock_bh(&(sk->sk_lock.slock));
1237 if (sk->sk_backlog.tail)
1238 __release_sock(sk);
1239 sk->sk_lock.owner = NULL;
1240 if (waitqueue_active(&(sk->sk_lock.wq)))
1241 wake_up(&(sk->sk_lock.wq));
1242 spin_unlock_bh(&(sk->sk_lock.slock));
1243}
1244EXPORT_SYMBOL(release_sock);
1245
1246int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1247{
1248 if (!sock_flag(sk, SOCK_TIMESTAMP))
1249 sock_enable_timestamp(sk);
1250 if (sk->sk_stamp.tv_sec == -1)
1251 return -ENOENT;
1252 if (sk->sk_stamp.tv_sec == 0)
1253 do_gettimeofday(&sk->sk_stamp);
1254 return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1255 -EFAULT : 0;
1256}
1257EXPORT_SYMBOL(sock_get_timestamp);
1258
1259void sock_enable_timestamp(struct sock *sk)
1260{
1261 if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1262 sock_set_flag(sk, SOCK_TIMESTAMP);
1263 net_enable_timestamp();
1264 }
1265}
1266EXPORT_SYMBOL(sock_enable_timestamp);
1267
1268/*
1269 * Get a socket option on a socket.
1270 *
1271 * FIX: POSIX 1003.1g is very ambiguous here. It states that
1272 * asynchronous errors should be reported by getsockopt. We assume
1273 * this means if you specify SO_ERROR (otherwise what's the point of it).
1274 */
1275int sock_common_getsockopt(struct socket *sock, int level, int optname,
1276 char __user *optval, int __user *optlen)
1277{
1278 struct sock *sk = sock->sk;
1279
1280 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1281}
1282
1283EXPORT_SYMBOL(sock_common_getsockopt);
1284
1285int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1286 struct msghdr *msg, size_t size, int flags)
1287{
1288 struct sock *sk = sock->sk;
1289 int addr_len = 0;
1290 int err;
1291
1292 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1293 flags & ~MSG_DONTWAIT, &addr_len);
1294 if (err >= 0)
1295 msg->msg_namelen = addr_len;
1296 return err;
1297}
1298
1299EXPORT_SYMBOL(sock_common_recvmsg);
1300
1301/*
1302 * Set socket options on an inet socket.
1303 */
1304int sock_common_setsockopt(struct socket *sock, int level, int optname,
1305 char __user *optval, int optlen)
1306{
1307 struct sock *sk = sock->sk;
1308
1309 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1310}
1311
1312EXPORT_SYMBOL(sock_common_setsockopt);
1313
1314void sk_common_release(struct sock *sk)
1315{
1316 if (sk->sk_prot->destroy)
1317 sk->sk_prot->destroy(sk);
1318
1319 /*
1320 * Observation: when sk_common_release is called, processes have
1321 * no access to the socket, but the net still has.
1322 * Step one, detach it from networking:
1323 *
1324 * A. Remove from hash tables.
1325 */
1326
1327 sk->sk_prot->unhash(sk);
1328
1329 /*
1330 * At this point the socket cannot receive new packets, but it is
1331 * possible that some packets are in flight because some CPU runs the
1332 * receiver and did the hash table lookup before we unhashed the socket.
1333 * They will reach the receive queue and be purged by the socket destructor.
1334 *
1335 * Also we still have packets pending on the receive queue and probably
1336 * our own packets waiting in device queues. sock_destroy will drain the
1337 * receive queue, but transmitted packets will delay socket destruction
1338 * until the last reference is released.
1339 */
1340
1341 sock_orphan(sk);
1342
1343 xfrm_sk_free_policy(sk);
1344
1345#ifdef INET_REFCNT_DEBUG
1346 if (atomic_read(&sk->sk_refcnt) != 1)
1347 printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n",
1348 sk, atomic_read(&sk->sk_refcnt));
1349#endif
1350 sock_put(sk);
1351}
1352
1353EXPORT_SYMBOL(sk_common_release);
1354
1355static DEFINE_RWLOCK(proto_list_lock);
1356static LIST_HEAD(proto_list);
1357
1358int proto_register(struct proto *prot, int alloc_slab)
1359{
1360 int rc = -ENOBUFS;
1361
1362 write_lock(&proto_list_lock);
1363
1364 if (alloc_slab) {
1365 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1366 SLAB_HWCACHE_ALIGN, NULL, NULL);
1367
1368 if (prot->slab == NULL) {
1369 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1370 prot->name);
1371 goto out_unlock;
1372 }
1373 }
1374
1375 list_add(&prot->node, &proto_list);
1376 rc = 0;
1377out_unlock:
1378 write_unlock(&proto_list_lock);
1379 return rc;
1380}
1381
1382EXPORT_SYMBOL(proto_register);
1383
1384void proto_unregister(struct proto *prot)
1385{
1386 write_lock(&proto_list_lock);
1387
1388 if (prot->slab != NULL) {
1389 kmem_cache_destroy(prot->slab);
1390 prot->slab = NULL;
1391 }
1392
1393 list_del(&prot->node);
1394 write_unlock(&proto_list_lock);
1395}
1396
1397EXPORT_SYMBOL(proto_unregister);
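
/*
 * A minimal sketch of module init/exit for the hypothetical protocol
 * used in the earlier sketches: register the struct proto (with its own
 * slab cache) on load, unregister it on unload.
 */
#if 0	/* illustrative only; struct example_sock is hypothetical */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct example_sock),
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1 /* alloc_slab */);
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}

module_init(example_init);
module_exit(example_exit);
#endif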
1398
1399#ifdef CONFIG_PROC_FS
1400static inline struct proto *__proto_head(void)
1401{
1402 return list_entry(proto_list.next, struct proto, node);
1403}
1404
1405static inline struct proto *proto_head(void)
1406{
1407 return list_empty(&proto_list) ? NULL : __proto_head();
1408}
1409
1410static inline struct proto *proto_next(struct proto *proto)
1411{
1412 return proto->node.next == &proto_list ? NULL :
1413 list_entry(proto->node.next, struct proto, node);
1414}
1415
1416static inline struct proto *proto_get_idx(loff_t pos)
1417{
1418 struct proto *proto;
1419 loff_t i = 0;
1420
1421 list_for_each_entry(proto, &proto_list, node)
1422 if (i++ == pos)
1423 goto out;
1424
1425 proto = NULL;
1426out:
1427 return proto;
1428}
1429
1430static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1431{
1432 read_lock(&proto_list_lock);
1433 return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1434}
1435
1436static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1437{
1438 ++*pos;
1439 return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1440}
1441
1442static void proto_seq_stop(struct seq_file *seq, void *v)
1443{
1444 read_unlock(&proto_list_lock);
1445}
1446
1447static char proto_method_implemented(const void *method)
1448{
1449 return method == NULL ? 'n' : 'y';
1450}
1451
1452static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1453{
1454 seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
1455 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1456 proto->name,
1457 proto->obj_size,
1458 proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1459 proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1460 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1461 proto->max_header,
1462 proto->slab == NULL ? "no" : "yes",
1463 module_name(proto->owner),
1464 proto_method_implemented(proto->close),
1465 proto_method_implemented(proto->connect),
1466 proto_method_implemented(proto->disconnect),
1467 proto_method_implemented(proto->accept),
1468 proto_method_implemented(proto->ioctl),
1469 proto_method_implemented(proto->init),
1470 proto_method_implemented(proto->destroy),
1471 proto_method_implemented(proto->shutdown),
1472 proto_method_implemented(proto->setsockopt),
1473 proto_method_implemented(proto->getsockopt),
1474 proto_method_implemented(proto->sendmsg),
1475 proto_method_implemented(proto->recvmsg),
1476 proto_method_implemented(proto->sendpage),
1477 proto_method_implemented(proto->bind),
1478 proto_method_implemented(proto->backlog_rcv),
1479 proto_method_implemented(proto->hash),
1480 proto_method_implemented(proto->unhash),
1481 proto_method_implemented(proto->get_port),
1482 proto_method_implemented(proto->enter_memory_pressure));
1483}
1484
1485static int proto_seq_show(struct seq_file *seq, void *v)
1486{
1487 if (v == SEQ_START_TOKEN)
1488 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1489 "protocol",
1490 "size",
1491 "sockets",
1492 "memory",
1493 "press",
1494 "maxhdr",
1495 "slab",
1496 "module",
1497 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1498 else
1499 proto_seq_printf(seq, v);
1500 return 0;
1501}
1502
1503static struct seq_operations proto_seq_ops = {
1504 .start = proto_seq_start,
1505 .next = proto_seq_next,
1506 .stop = proto_seq_stop,
1507 .show = proto_seq_show,
1508};
1509
1510static int proto_seq_open(struct inode *inode, struct file *file)
1511{
1512 return seq_open(file, &proto_seq_ops);
1513}
1514
1515static struct file_operations proto_seq_fops = {
1516 .owner = THIS_MODULE,
1517 .open = proto_seq_open,
1518 .read = seq_read,
1519 .llseek = seq_lseek,
1520 .release = seq_release,
1521};
1522
1523static int __init proto_init(void)
1524{
1525 /* register /proc/net/protocols */
1526 return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1527}
1528
1529subsys_initcall(proto_init);
1530
1531#endif /* PROC_FS */
1532
1533EXPORT_SYMBOL(sk_alloc);
1534EXPORT_SYMBOL(sk_free);
1535EXPORT_SYMBOL(sk_send_sigurg);
1536EXPORT_SYMBOL(sock_alloc_send_skb);
1537EXPORT_SYMBOL(sock_init_data);
1538EXPORT_SYMBOL(sock_kfree_s);
1539EXPORT_SYMBOL(sock_kmalloc);
1540EXPORT_SYMBOL(sock_no_accept);
1541EXPORT_SYMBOL(sock_no_bind);
1542EXPORT_SYMBOL(sock_no_connect);
1543EXPORT_SYMBOL(sock_no_getname);
1544EXPORT_SYMBOL(sock_no_getsockopt);
1545EXPORT_SYMBOL(sock_no_ioctl);
1546EXPORT_SYMBOL(sock_no_listen);
1547EXPORT_SYMBOL(sock_no_mmap);
1548EXPORT_SYMBOL(sock_no_poll);
1549EXPORT_SYMBOL(sock_no_recvmsg);
1550EXPORT_SYMBOL(sock_no_sendmsg);
1551EXPORT_SYMBOL(sock_no_sendpage);
1552EXPORT_SYMBOL(sock_no_setsockopt);
1553EXPORT_SYMBOL(sock_no_shutdown);
1554EXPORT_SYMBOL(sock_no_socketpair);
1555EXPORT_SYMBOL(sock_rfree);
1556EXPORT_SYMBOL(sock_setsockopt);
1557EXPORT_SYMBOL(sock_wfree);
1558EXPORT_SYMBOL(sock_wmalloc);
1559EXPORT_SYMBOL(sock_i_uid);
1560EXPORT_SYMBOL(sock_i_ino);
1561#ifdef CONFIG_SYSCTL
1562EXPORT_SYMBOL(sysctl_optmem_max);
1563EXPORT_SYMBOL(sysctl_rmem_max);
1564EXPORT_SYMBOL(sysctl_wmem_max);
1565#endif
diff --git a/net/core/stream.c b/net/core/stream.c
new file mode 100644
index 000000000000..1e27a57b5a97
--- /dev/null
+++ b/net/core/stream.c
@@ -0,0 +1,287 @@
1/*
2 * SUCS NET3:
3 *
4 * Generic stream handling routines. These are generic for most
5 * protocols. Even IP. Tonight 8-).
6 * This is used because TCP, LLC (others too) layer all have mostly
7 * identical sendmsg() and recvmsg() code.
8 * So we (will) share it here.
9 *
10 * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
11 * (from old tcp.c code)
12 * Alan Cox <alan@redhat.com> (Borrowed comments 8-))
13 */
14
15#include <linux/module.h>
16#include <linux/net.h>
17#include <linux/signal.h>
18#include <linux/tcp.h>
19#include <linux/wait.h>
20#include <net/sock.h>
21
22/**
23 * sk_stream_write_space - stream socket write_space callback.
24 * @sk: socket
25 *
26 * Wake up waiting writers once enough write space is available again.
27 */
28void sk_stream_write_space(struct sock *sk)
29{
30 struct socket *sock = sk->sk_socket;
31
32 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
33 clear_bit(SOCK_NOSPACE, &sock->flags);
34
35 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
36 wake_up_interruptible(sk->sk_sleep);
37 if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
38 sock_wake_async(sock, 2, POLL_OUT);
39 }
40}
41
42EXPORT_SYMBOL(sk_stream_write_space);
43
44/**
45 * sk_stream_wait_connect - Wait for a socket to get into the connected state
46 * @sk: sock to wait on
47 * @timeo_p: for how long to wait
48 *
49 * Must be called with the socket locked.
50 */
51int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
52{
53 struct task_struct *tsk = current;
54 DEFINE_WAIT(wait);
55
56 while (1) {
57 if (sk->sk_err)
58 return sock_error(sk);
59 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
60 return -EPIPE;
61 if (!*timeo_p)
62 return -EAGAIN;
63 if (signal_pending(tsk))
64 return sock_intr_errno(*timeo_p);
65
66 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
67 sk->sk_write_pending++;
68 if (sk_wait_event(sk, timeo_p,
69 !((1 << sk->sk_state) &
70 ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))))
71 break;
72 finish_wait(sk->sk_sleep, &wait);
73 sk->sk_write_pending--;
74 }
75 return 0;
76}
77
78EXPORT_SYMBOL(sk_stream_wait_connect);
79
80/**
81 * sk_stream_closing - Return 1 if we still have things to send in our buffers.
82 * @sk: socket to verify
83 */
84static inline int sk_stream_closing(struct sock *sk)
85{
86 return (1 << sk->sk_state) &
87 (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
88}
89
90void sk_stream_wait_close(struct sock *sk, long timeout)
91{
92 if (timeout) {
93 DEFINE_WAIT(wait);
94
95 do {
96 prepare_to_wait(sk->sk_sleep, &wait,
97 TASK_INTERRUPTIBLE);
98 if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
99 break;
100 } while (!signal_pending(current) && timeout);
101
102 finish_wait(sk->sk_sleep, &wait);
103 }
104}
105
106EXPORT_SYMBOL(sk_stream_wait_close);
107
108/**
109 * sk_stream_wait_memory - Wait for more memory for a socket
110 * @sk: socket to wait for memory
111 * @timeo_p: for how long
112 */
113int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
114{
115 int err = 0;
116 long vm_wait = 0;
117 long current_timeo = *timeo_p;
118 DEFINE_WAIT(wait);
119
120 if (sk_stream_memory_free(sk))
121 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
122
123 while (1) {
124 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
125
126 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
127
128 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
129 goto do_error;
130 if (!*timeo_p)
131 goto do_nonblock;
132 if (signal_pending(current))
133 goto do_interrupted;
134 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
135 if (sk_stream_memory_free(sk) && !vm_wait)
136 break;
137
138 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
139 sk->sk_write_pending++;
140 sk_wait_event(sk, &current_timeo, sk_stream_memory_free(sk) &&
141 vm_wait);
142 sk->sk_write_pending--;
143
144 if (vm_wait) {
145 vm_wait -= current_timeo;
146 current_timeo = *timeo_p;
147 if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
148 (current_timeo -= vm_wait) < 0)
149 current_timeo = 0;
150 vm_wait = 0;
151 }
152 *timeo_p = current_timeo;
153 }
154out:
155 finish_wait(sk->sk_sleep, &wait);
156 return err;
157
158do_error:
159 err = -EPIPE;
160 goto out;
161do_nonblock:
162 err = -EAGAIN;
163 goto out;
164do_interrupted:
165 err = sock_intr_errno(*timeo_p);
166 goto out;
167}
168
169EXPORT_SYMBOL(sk_stream_wait_memory);
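
/*
 * A minimal sketch of how a stream sendmsg() path typically uses the
 * helper above, assuming the socket lock is held and "timeo" came from
 * sock_sndtimeo().  Returns 0 once write memory is available, or
 * -EAGAIN, -EPIPE or a signal-related error as set up by
 * sk_stream_wait_memory().
 */
static int wait_for_sndbuf_example(struct sock *sk, long *timeo)
{
	if (sk_stream_memory_free(sk))
		return 0;

	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
	return sk_stream_wait_memory(sk, timeo);
}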
170
171void sk_stream_rfree(struct sk_buff *skb)
172{
173 struct sock *sk = skb->sk;
174
175 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
176 sk->sk_forward_alloc += skb->truesize;
177}
178
179EXPORT_SYMBOL(sk_stream_rfree);
180
181int sk_stream_error(struct sock *sk, int flags, int err)
182{
183 if (err == -EPIPE)
184 err = sock_error(sk) ? : -EPIPE;
185 if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
186 send_sig(SIGPIPE, current, 0);
187 return err;
188}
189
190EXPORT_SYMBOL(sk_stream_error);
191
192void __sk_stream_mem_reclaim(struct sock *sk)
193{
194 if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM) {
195 atomic_sub(sk->sk_forward_alloc / SK_STREAM_MEM_QUANTUM,
196 sk->sk_prot->memory_allocated);
197 sk->sk_forward_alloc &= SK_STREAM_MEM_QUANTUM - 1;
198 if (*sk->sk_prot->memory_pressure &&
199 (atomic_read(sk->sk_prot->memory_allocated) <
200 sk->sk_prot->sysctl_mem[0]))
201 *sk->sk_prot->memory_pressure = 0;
202 }
203}
204
205EXPORT_SYMBOL(__sk_stream_mem_reclaim);
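
A worked example of the reclaim arithmetic above, assuming SK_STREAM_MEM_QUANTUM equals PAGE_SIZE (4096 bytes on most architectures; the exact definition lives in the sock headers): if sk_forward_alloc is 10000 when __sk_stream_mem_reclaim() runs, 10000 / 4096 = 2 whole quanta are subtracted from the protocol's memory_allocated counter, and the mask with (SK_STREAM_MEM_QUANTUM - 1) leaves 10000 & 4095 = 1808 bytes in sk_forward_alloc. Only whole quanta are returned to the global pool; the sub-quantum remainder stays reserved for the socket. The masking trick requires the quantum to be a power of two.
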
206
207int sk_stream_mem_schedule(struct sock *sk, int size, int kind)
208{
209 int amt = sk_stream_pages(size);
210
211 sk->sk_forward_alloc += amt * SK_STREAM_MEM_QUANTUM;
212 atomic_add(amt, sk->sk_prot->memory_allocated);
213
214 /* Under limit. */
215 if (atomic_read(sk->sk_prot->memory_allocated) < sk->sk_prot->sysctl_mem[0]) {
216 if (*sk->sk_prot->memory_pressure)
217 *sk->sk_prot->memory_pressure = 0;
218 return 1;
219 }
220
221 /* Over hard limit. */
222 if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[2]) {
223 sk->sk_prot->enter_memory_pressure();
224 goto suppress_allocation;
225 }
226
227 /* Under pressure. */
228 if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[1])
229 sk->sk_prot->enter_memory_pressure();
230
231 if (kind) {
232 if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_prot->sysctl_rmem[0])
233 return 1;
234 } else if (sk->sk_wmem_queued < sk->sk_prot->sysctl_wmem[0])
235 return 1;
236
237 if (!*sk->sk_prot->memory_pressure ||
238 sk->sk_prot->sysctl_mem[2] > atomic_read(sk->sk_prot->sockets_allocated) *
239 sk_stream_pages(sk->sk_wmem_queued +
240 atomic_read(&sk->sk_rmem_alloc) +
241 sk->sk_forward_alloc))
242 return 1;
243
244suppress_allocation:
245
246 if (!kind) {
247 sk_stream_moderate_sndbuf(sk);
248
249 /* Fail only if socket is _under_ its sndbuf.
250	 * In this case we cannot block, so we have to fail.
251 */
252 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
253 return 1;
254 }
255
256 /* Alas. Undo changes. */
257 sk->sk_forward_alloc -= amt * SK_STREAM_MEM_QUANTUM;
258 atomic_sub(amt, sk->sk_prot->memory_allocated);
259 return 0;
260}
261
262EXPORT_SYMBOL(sk_stream_mem_schedule);
263
264void sk_stream_kill_queues(struct sock *sk)
265{
266 /* First the read buffer. */
267 __skb_queue_purge(&sk->sk_receive_queue);
268
269 /* Next, the error queue. */
270 __skb_queue_purge(&sk->sk_error_queue);
271
272 /* Next, the write queue. */
273 BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
274
275 /* Account for returned memory. */
276 sk_stream_mem_reclaim(sk);
277
278 BUG_TRAP(!sk->sk_wmem_queued);
279 BUG_TRAP(!sk->sk_forward_alloc);
280
281 /* It is _impossible_ for the backlog to contain anything
282 * when we get here. All user references to this socket
283	 * have gone away; only the net layer can touch it.
284 */
285}
286
287EXPORT_SYMBOL(sk_stream_kill_queues);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
new file mode 100644
index 000000000000..c8be646cb191
--- /dev/null
+++ b/net/core/sysctl_net_core.c
@@ -0,0 +1,182 @@
1/* -*- linux-c -*-
2 * sysctl_net_core.c: sysctl interface to net core subsystem.
3 *
4 * Begun April 1, 1996, Mike Shaver.
5 * Added /proc/sys/net/core directory entry (empty =) ). [MS]
6 */
7
8#include <linux/mm.h>
9#include <linux/sysctl.h>
10#include <linux/config.h>
11#include <linux/module.h>
12
13#ifdef CONFIG_SYSCTL
14
15extern int netdev_max_backlog;
16extern int weight_p;
17extern int no_cong_thresh;
18extern int no_cong;
19extern int lo_cong;
20extern int mod_cong;
21extern int netdev_fastroute;
22extern int net_msg_cost;
23extern int net_msg_burst;
24
25extern __u32 sysctl_wmem_max;
26extern __u32 sysctl_rmem_max;
27extern __u32 sysctl_wmem_default;
28extern __u32 sysctl_rmem_default;
29
30extern int sysctl_core_destroy_delay;
31extern int sysctl_optmem_max;
32extern int sysctl_somaxconn;
33
34#ifdef CONFIG_NET_DIVERT
35extern char sysctl_divert_version[];
36#endif /* CONFIG_NET_DIVERT */
37
38/*
39 * This strdup() is used for creating copies of network
40 * device names to be handed over to sysctl.
41 */
42
43char *net_sysctl_strdup(const char *s)
44{
45 char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
46 if (rv)
47 strcpy(rv, s);
48 return rv;
49}
50
51ctl_table core_table[] = {
52#ifdef CONFIG_NET
53 {
54 .ctl_name = NET_CORE_WMEM_MAX,
55 .procname = "wmem_max",
56 .data = &sysctl_wmem_max,
57 .maxlen = sizeof(int),
58 .mode = 0644,
59 .proc_handler = &proc_dointvec
60 },
61 {
62 .ctl_name = NET_CORE_RMEM_MAX,
63 .procname = "rmem_max",
64 .data = &sysctl_rmem_max,
65 .maxlen = sizeof(int),
66 .mode = 0644,
67 .proc_handler = &proc_dointvec
68 },
69 {
70 .ctl_name = NET_CORE_WMEM_DEFAULT,
71 .procname = "wmem_default",
72 .data = &sysctl_wmem_default,
73 .maxlen = sizeof(int),
74 .mode = 0644,
75 .proc_handler = &proc_dointvec
76 },
77 {
78 .ctl_name = NET_CORE_RMEM_DEFAULT,
79 .procname = "rmem_default",
80 .data = &sysctl_rmem_default,
81 .maxlen = sizeof(int),
82 .mode = 0644,
83 .proc_handler = &proc_dointvec
84 },
85 {
86 .ctl_name = NET_CORE_DEV_WEIGHT,
87 .procname = "dev_weight",
88 .data = &weight_p,
89 .maxlen = sizeof(int),
90 .mode = 0644,
91 .proc_handler = &proc_dointvec
92 },
93 {
94 .ctl_name = NET_CORE_MAX_BACKLOG,
95 .procname = "netdev_max_backlog",
96 .data = &netdev_max_backlog,
97 .maxlen = sizeof(int),
98 .mode = 0644,
99 .proc_handler = &proc_dointvec
100 },
101 {
102 .ctl_name = NET_CORE_NO_CONG_THRESH,
103 .procname = "no_cong_thresh",
104 .data = &no_cong_thresh,
105 .maxlen = sizeof(int),
106 .mode = 0644,
107 .proc_handler = &proc_dointvec
108 },
109 {
110 .ctl_name = NET_CORE_NO_CONG,
111 .procname = "no_cong",
112 .data = &no_cong,
113 .maxlen = sizeof(int),
114 .mode = 0644,
115 .proc_handler = &proc_dointvec
116 },
117 {
118 .ctl_name = NET_CORE_LO_CONG,
119 .procname = "lo_cong",
120 .data = &lo_cong,
121 .maxlen = sizeof(int),
122 .mode = 0644,
123 .proc_handler = &proc_dointvec
124 },
125 {
126 .ctl_name = NET_CORE_MOD_CONG,
127 .procname = "mod_cong",
128 .data = &mod_cong,
129 .maxlen = sizeof(int),
130 .mode = 0644,
131 .proc_handler = &proc_dointvec
132 },
133 {
134 .ctl_name = NET_CORE_MSG_COST,
135 .procname = "message_cost",
136 .data = &net_msg_cost,
137 .maxlen = sizeof(int),
138 .mode = 0644,
139 .proc_handler = &proc_dointvec_jiffies,
140 .strategy = &sysctl_jiffies,
141 },
142 {
143 .ctl_name = NET_CORE_MSG_BURST,
144 .procname = "message_burst",
145 .data = &net_msg_burst,
146 .maxlen = sizeof(int),
147 .mode = 0644,
148 .proc_handler = &proc_dointvec,
149 },
150 {
151 .ctl_name = NET_CORE_OPTMEM_MAX,
152 .procname = "optmem_max",
153 .data = &sysctl_optmem_max,
154 .maxlen = sizeof(int),
155 .mode = 0644,
156 .proc_handler = &proc_dointvec
157 },
158#ifdef CONFIG_NET_DIVERT
159 {
160 .ctl_name = NET_CORE_DIVERT_VERSION,
161 .procname = "divert_version",
162 .data = (void *)sysctl_divert_version,
163 .maxlen = 32,
164 .mode = 0444,
165 .proc_handler = &proc_dostring
166 },
167#endif /* CONFIG_NET_DIVERT */
168#endif /* CONFIG_NET */
169 {
170 .ctl_name = NET_CORE_SOMAXCONN,
171 .procname = "somaxconn",
172 .data = &sysctl_somaxconn,
173 .maxlen = sizeof(int),
174 .mode = 0644,
175 .proc_handler = &proc_dointvec
176 },
177 { .ctl_name = 0 }
178};
179
180EXPORT_SYMBOL(net_sysctl_strdup);
181
182#endif
diff --git a/net/core/utils.c b/net/core/utils.c
new file mode 100644
index 000000000000..e11a8654f363
--- /dev/null
+++ b/net/core/utils.c
@@ -0,0 +1,155 @@
1/*
2 *	Generic address resolution entity
3 *
4 * Authors:
5 * net_random Alan Cox
6 * net_ratelimit Andy Kleen
7 *
8 * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <linux/module.h>
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/mm.h>
20#include <linux/string.h>
21#include <linux/types.h>
22#include <linux/random.h>
23#include <linux/percpu.h>
24#include <linux/init.h>
25
26#include <asm/system.h>
27#include <asm/uaccess.h>
28
29
30/*
31 This is a maximally equidistributed combined Tausworthe generator
32 based on code from GNU Scientific Library 1.5 (30 Jun 2004)
33
34 x_n = (s1_n ^ s2_n ^ s3_n)
35
36 s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19))
37 s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25))
38 s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11))
39
40 The period of this generator is about 2^88.
41
42 From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe
43 Generators", Mathematics of Computation, 65, 213 (1996), 203--213.
44
45 This is available on the net from L'Ecuyer's home page,
46
47 http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps
48 ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps
49
50 There is an erratum in the paper "Tables of Maximally
51 Equidistributed Combined LFSR Generators", Mathematics of
52 Computation, 68, 225 (1999), 261--269:
53 http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps
54
55 ... the k_j most significant bits of z_j must be non-
56 zero, for each j. (Note: this restriction also applies to the
57 computer code given in [4], but was mistakenly not mentioned in
58 that paper.)
59
60 This affects the seeding procedure by imposing the requirement
61 s1 > 1, s2 > 7, s3 > 15.
62
63*/
64struct nrnd_state {
65 u32 s1, s2, s3;
66};
67
68static DEFINE_PER_CPU(struct nrnd_state, net_rand_state);
69
70static u32 __net_random(struct nrnd_state *state)
71{
72#define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b)
73
74 state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12);
75 state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4);
76 state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17);
77
78 return (state->s1 ^ state->s2 ^ state->s3);
79}
80
81static void __net_srandom(struct nrnd_state *state, unsigned long s)
82{
83 if (s == 0)
84 s = 1; /* default seed is 1 */
85
86#define LCG(n) (69069 * n)
87 state->s1 = LCG(s);
88 state->s2 = LCG(state->s1);
89 state->s3 = LCG(state->s2);
90
91 /* "warm it up" */
92 __net_random(state);
93 __net_random(state);
94 __net_random(state);
95 __net_random(state);
96 __net_random(state);
97 __net_random(state);
98}
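
To make the mapping between the TAUSWORTHE() macro above and the s1/s2/s3 recurrences quoted from L'Ecuyer explicit, here is a small, self-contained user-space sketch of the same combined generator. It is illustrative only, not kernel code, and uses hypothetical fixed seeds chosen to satisfy the s1 > 1, s2 > 7, s3 > 15 constraint mentioned in the erratum:

#include <stdint.h>
#include <stdio.h>

/* One step of each component LFSR, identical in shape to the kernel's
 * TAUSWORTHE(s, a, b, c, d) macro. */
static uint32_t taus_step(uint32_t s, int a, int b, uint32_t c, int d)
{
	return ((s & c) << d) ^ (((s << a) ^ s) >> b);
}

int main(void)
{
	/* Hypothetical seeds; they satisfy s1 > 1, s2 > 7, s3 > 15. */
	uint32_t s1 = 2, s2 = 8, s3 = 16;
	int i;

	for (i = 0; i < 5; i++) {
		s1 = taus_step(s1, 13, 19, 4294967294u, 12);
		s2 = taus_step(s2,  2, 25, 4294967288u,  4);
		s3 = taus_step(s3,  3, 11, 4294967280u, 17);
		printf("%u\n", s1 ^ s2 ^ s3);	/* x_n = s1 ^ s2 ^ s3 */
	}
	return 0;
}
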
99
100
101unsigned long net_random(void)
102{
103 unsigned long r;
104 struct nrnd_state *state = &get_cpu_var(net_rand_state);
105 r = __net_random(state);
106 put_cpu_var(state);
107 return r;
108}
109
110
111void net_srandom(unsigned long entropy)
112{
113 struct nrnd_state *state = &get_cpu_var(net_rand_state);
114 __net_srandom(state, state->s1^entropy);
115 put_cpu_var(state);
116}
117
118void __init net_random_init(void)
119{
120 int i;
121
122 for (i = 0; i < NR_CPUS; i++) {
123 struct nrnd_state *state = &per_cpu(net_rand_state,i);
124 __net_srandom(state, i+jiffies);
125 }
126}
127
128static int net_random_reseed(void)
129{
130 int i;
131 unsigned long seed[NR_CPUS];
132
133 get_random_bytes(seed, sizeof(seed));
134 for (i = 0; i < NR_CPUS; i++) {
135 struct nrnd_state *state = &per_cpu(net_rand_state,i);
136 __net_srandom(state, seed[i]);
137 }
138 return 0;
139}
140late_initcall(net_random_reseed);
141
142int net_msg_cost = 5*HZ;
143int net_msg_burst = 10;
144
145/*
146 * All net warning printk()s should be guarded by this function.
147 */
148int net_ratelimit(void)
149{
150 return __printk_ratelimit(net_msg_cost, net_msg_burst);
151}
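
As the comment above says, net-layer warning printk()s are meant to be wrapped in net_ratelimit(). A typical call site (the device and message are hypothetical) looks like:

	if (net_ratelimit())
		printk(KERN_WARNING "%s: dropping malformed packet\n",
		       dev->name);
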
152
153EXPORT_SYMBOL(net_random);
154EXPORT_SYMBOL(net_ratelimit);
155EXPORT_SYMBOL(net_srandom);
diff --git a/net/core/wireless.c b/net/core/wireless.c
new file mode 100644
index 000000000000..750cc5daeb03
--- /dev/null
+++ b/net/core/wireless.c
@@ -0,0 +1,1459 @@
1/*
2 * This file implements the Wireless Extensions APIs.
3 *
4 * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com>
5 * Copyright (c) 1997-2004 Jean Tourrilhes, All Rights Reserved.
6 *
7 * (As with the rest of the Linux kernel, this file is GPL)
8 */
9
10/************************** DOCUMENTATION **************************/
11/*
12 * API definition :
13 * --------------
14 * See <linux/wireless.h> for details of the APIs and the rest.
15 *
16 * History :
17 * -------
18 *
19 * v1 - 5.12.01 - Jean II
20 * o Created this file.
21 *
22 * v2 - 13.12.01 - Jean II
23 * o Move /proc/net/wireless stuff from net/core/dev.c to here
24 * o Make Wireless Extension IOCTLs go through here
25 * o Added iw_handler handling ;-)
26 * o Added standard ioctl description
27 * o Initial dumb commit strategy based on orinoco.c
28 *
29 * v3 - 19.12.01 - Jean II
30 * o Make sure we don't go out of standard_ioctl[] in ioctl_standard_call
31 * o Add event dispatcher function
32 * o Add event description
33 * o Propagate events as rtnetlink IFLA_WIRELESS option
34 * o Generate event on selected SET requests
35 *
36 * v4 - 18.04.02 - Jean II
37 * o Fix stupid off by one in iw_ioctl_description : IW_ESSID_MAX_SIZE + 1
38 *
39 * v5 - 21.06.02 - Jean II
40 * o Add IW_PRIV_TYPE_ADDR in priv_type_size (+cleanup)
41 * o Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes
42 * o Add IWEVCUSTOM for driver specific event/scanning token
43 * o Turn on WE_STRICT_WRITE by default + kernel warning
44 * o Fix WE_STRICT_WRITE in ioctl_export_private() (32 => iw_num)
45 * o Fix off-by-one in test (extra_size <= IFNAMSIZ)
46 *
47 * v6 - 9.01.03 - Jean II
48 * o Add common spy support : iw_handler_set_spy(), wireless_spy_update()
49 * o Add enhanced spy support : iw_handler_set_thrspy() and event.
50 * o Add WIRELESS_EXT version display in /proc/net/wireless
51 *
52 * v6 - 18.06.04 - Jean II
53 * o Change get_spydata() method for added safety
54 * o Remove spy #ifdef, they are always on -> cleaner code
55 * o Allow any size GET request if user specifies length > max
56 * and if request has IW_DESCR_FLAG_NOMAX flag or is SIOCGIWPRIV
57 * o Start migrating get_wireless_stats to struct iw_handler_def
58 * o Add wmb() in iw_handler_set_spy() for non-coherent archs/cpus
59 * Based on patch from Pavel Roskin <proski@gnu.org> :
60 * o Fix kernel data leak to user space in private handler handling
61 */
62
63/***************************** INCLUDES *****************************/
64
65#include <linux/config.h> /* Not needed ??? */
66#include <linux/module.h>
67#include <linux/types.h> /* off_t */
68#include <linux/netdevice.h> /* struct ifreq, dev_get_by_name() */
69#include <linux/proc_fs.h>
70#include <linux/rtnetlink.h> /* rtnetlink stuff */
71#include <linux/seq_file.h>
72#include <linux/init.h> /* for __init */
73#include <linux/if_arp.h> /* ARPHRD_ETHER */
74
75#include <linux/wireless.h> /* Pretty obvious */
76#include <net/iw_handler.h> /* New driver API */
77
78#include <asm/uaccess.h> /* copy_to_user() */
79
80/**************************** CONSTANTS ****************************/
81
82/* Debugging stuff */
83#undef WE_IOCTL_DEBUG /* Debug IOCTL API */
84#undef WE_EVENT_DEBUG /* Debug Event dispatcher */
85#undef WE_SPY_DEBUG /* Debug enhanced spy support */
86
87/* Options */
88#define WE_EVENT_NETLINK /* Propagate events using rtnetlink */
89#define WE_SET_EVENT /* Generate an event on some set commands */
90
91/************************* GLOBAL VARIABLES *************************/
92/*
93 * You should not use global variables, because of re-entrancy.
94 * In our case, it's only const, so it's OK...
95 */
96/*
97 * Meta-data about all the standard Wireless Extension requests we
98 * know about.
99 */
100static const struct iw_ioctl_description standard_ioctl[] = {
101 [SIOCSIWCOMMIT - SIOCIWFIRST] = {
102 .header_type = IW_HEADER_TYPE_NULL,
103 },
104 [SIOCGIWNAME - SIOCIWFIRST] = {
105 .header_type = IW_HEADER_TYPE_CHAR,
106 .flags = IW_DESCR_FLAG_DUMP,
107 },
108 [SIOCSIWNWID - SIOCIWFIRST] = {
109 .header_type = IW_HEADER_TYPE_PARAM,
110 .flags = IW_DESCR_FLAG_EVENT,
111 },
112 [SIOCGIWNWID - SIOCIWFIRST] = {
113 .header_type = IW_HEADER_TYPE_PARAM,
114 .flags = IW_DESCR_FLAG_DUMP,
115 },
116 [SIOCSIWFREQ - SIOCIWFIRST] = {
117 .header_type = IW_HEADER_TYPE_FREQ,
118 .flags = IW_DESCR_FLAG_EVENT,
119 },
120 [SIOCGIWFREQ - SIOCIWFIRST] = {
121 .header_type = IW_HEADER_TYPE_FREQ,
122 .flags = IW_DESCR_FLAG_DUMP,
123 },
124 [SIOCSIWMODE - SIOCIWFIRST] = {
125 .header_type = IW_HEADER_TYPE_UINT,
126 .flags = IW_DESCR_FLAG_EVENT,
127 },
128 [SIOCGIWMODE - SIOCIWFIRST] = {
129 .header_type = IW_HEADER_TYPE_UINT,
130 .flags = IW_DESCR_FLAG_DUMP,
131 },
132 [SIOCSIWSENS - SIOCIWFIRST] = {
133 .header_type = IW_HEADER_TYPE_PARAM,
134 },
135 [SIOCGIWSENS - SIOCIWFIRST] = {
136 .header_type = IW_HEADER_TYPE_PARAM,
137 },
138 [SIOCSIWRANGE - SIOCIWFIRST] = {
139 .header_type = IW_HEADER_TYPE_NULL,
140 },
141 [SIOCGIWRANGE - SIOCIWFIRST] = {
142 .header_type = IW_HEADER_TYPE_POINT,
143 .token_size = 1,
144 .max_tokens = sizeof(struct iw_range),
145 .flags = IW_DESCR_FLAG_DUMP,
146 },
147 [SIOCSIWPRIV - SIOCIWFIRST] = {
148 .header_type = IW_HEADER_TYPE_NULL,
149 },
150 [SIOCGIWPRIV - SIOCIWFIRST] = { /* (handled directly by us) */
151 .header_type = IW_HEADER_TYPE_NULL,
152 },
153 [SIOCSIWSTATS - SIOCIWFIRST] = {
154 .header_type = IW_HEADER_TYPE_NULL,
155 },
156 [SIOCGIWSTATS - SIOCIWFIRST] = { /* (handled directly by us) */
157 .header_type = IW_HEADER_TYPE_NULL,
158 .flags = IW_DESCR_FLAG_DUMP,
159 },
160 [SIOCSIWSPY - SIOCIWFIRST] = {
161 .header_type = IW_HEADER_TYPE_POINT,
162 .token_size = sizeof(struct sockaddr),
163 .max_tokens = IW_MAX_SPY,
164 },
165 [SIOCGIWSPY - SIOCIWFIRST] = {
166 .header_type = IW_HEADER_TYPE_POINT,
167 .token_size = sizeof(struct sockaddr) +
168 sizeof(struct iw_quality),
169 .max_tokens = IW_MAX_SPY,
170 },
171 [SIOCSIWTHRSPY - SIOCIWFIRST] = {
172 .header_type = IW_HEADER_TYPE_POINT,
173 .token_size = sizeof(struct iw_thrspy),
174 .min_tokens = 1,
175 .max_tokens = 1,
176 },
177 [SIOCGIWTHRSPY - SIOCIWFIRST] = {
178 .header_type = IW_HEADER_TYPE_POINT,
179 .token_size = sizeof(struct iw_thrspy),
180 .min_tokens = 1,
181 .max_tokens = 1,
182 },
183 [SIOCSIWAP - SIOCIWFIRST] = {
184 .header_type = IW_HEADER_TYPE_ADDR,
185 },
186 [SIOCGIWAP - SIOCIWFIRST] = {
187 .header_type = IW_HEADER_TYPE_ADDR,
188 .flags = IW_DESCR_FLAG_DUMP,
189 },
190 [SIOCGIWAPLIST - SIOCIWFIRST] = {
191 .header_type = IW_HEADER_TYPE_POINT,
192 .token_size = sizeof(struct sockaddr) +
193 sizeof(struct iw_quality),
194 .max_tokens = IW_MAX_AP,
195 .flags = IW_DESCR_FLAG_NOMAX,
196 },
197 [SIOCSIWSCAN - SIOCIWFIRST] = {
198 .header_type = IW_HEADER_TYPE_PARAM,
199 },
200 [SIOCGIWSCAN - SIOCIWFIRST] = {
201 .header_type = IW_HEADER_TYPE_POINT,
202 .token_size = 1,
203 .max_tokens = IW_SCAN_MAX_DATA,
204 .flags = IW_DESCR_FLAG_NOMAX,
205 },
206 [SIOCSIWESSID - SIOCIWFIRST] = {
207 .header_type = IW_HEADER_TYPE_POINT,
208 .token_size = 1,
209 .max_tokens = IW_ESSID_MAX_SIZE + 1,
210 .flags = IW_DESCR_FLAG_EVENT,
211 },
212 [SIOCGIWESSID - SIOCIWFIRST] = {
213 .header_type = IW_HEADER_TYPE_POINT,
214 .token_size = 1,
215 .max_tokens = IW_ESSID_MAX_SIZE + 1,
216 .flags = IW_DESCR_FLAG_DUMP,
217 },
218 [SIOCSIWNICKN - SIOCIWFIRST] = {
219 .header_type = IW_HEADER_TYPE_POINT,
220 .token_size = 1,
221 .max_tokens = IW_ESSID_MAX_SIZE + 1,
222 },
223 [SIOCGIWNICKN - SIOCIWFIRST] = {
224 .header_type = IW_HEADER_TYPE_POINT,
225 .token_size = 1,
226 .max_tokens = IW_ESSID_MAX_SIZE + 1,
227 },
228 [SIOCSIWRATE - SIOCIWFIRST] = {
229 .header_type = IW_HEADER_TYPE_PARAM,
230 },
231 [SIOCGIWRATE - SIOCIWFIRST] = {
232 .header_type = IW_HEADER_TYPE_PARAM,
233 },
234 [SIOCSIWRTS - SIOCIWFIRST] = {
235 .header_type = IW_HEADER_TYPE_PARAM,
236 },
237 [SIOCGIWRTS - SIOCIWFIRST] = {
238 .header_type = IW_HEADER_TYPE_PARAM,
239 },
240 [SIOCSIWFRAG - SIOCIWFIRST] = {
241 .header_type = IW_HEADER_TYPE_PARAM,
242 },
243 [SIOCGIWFRAG - SIOCIWFIRST] = {
244 .header_type = IW_HEADER_TYPE_PARAM,
245 },
246 [SIOCSIWTXPOW - SIOCIWFIRST] = {
247 .header_type = IW_HEADER_TYPE_PARAM,
248 },
249 [SIOCGIWTXPOW - SIOCIWFIRST] = {
250 .header_type = IW_HEADER_TYPE_PARAM,
251 },
252 [SIOCSIWRETRY - SIOCIWFIRST] = {
253 .header_type = IW_HEADER_TYPE_PARAM,
254 },
255 [SIOCGIWRETRY - SIOCIWFIRST] = {
256 .header_type = IW_HEADER_TYPE_PARAM,
257 },
258 [SIOCSIWENCODE - SIOCIWFIRST] = {
259 .header_type = IW_HEADER_TYPE_POINT,
260 .token_size = 1,
261 .max_tokens = IW_ENCODING_TOKEN_MAX,
262 .flags = IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT,
263 },
264 [SIOCGIWENCODE - SIOCIWFIRST] = {
265 .header_type = IW_HEADER_TYPE_POINT,
266 .token_size = 1,
267 .max_tokens = IW_ENCODING_TOKEN_MAX,
268 .flags = IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT,
269 },
270 [SIOCSIWPOWER - SIOCIWFIRST] = {
271 .header_type = IW_HEADER_TYPE_PARAM,
272 },
273 [SIOCGIWPOWER - SIOCIWFIRST] = {
274 .header_type = IW_HEADER_TYPE_PARAM,
275 },
276};
277static const int standard_ioctl_num = (sizeof(standard_ioctl) /
278 sizeof(struct iw_ioctl_description));
279
280/*
281 * Meta-data about all the additional standard Wireless Extension events
282 * we know about.
283 */
284static const struct iw_ioctl_description standard_event[] = {
285 [IWEVTXDROP - IWEVFIRST] = {
286 .header_type = IW_HEADER_TYPE_ADDR,
287 },
288 [IWEVQUAL - IWEVFIRST] = {
289 .header_type = IW_HEADER_TYPE_QUAL,
290 },
291 [IWEVCUSTOM - IWEVFIRST] = {
292 .header_type = IW_HEADER_TYPE_POINT,
293 .token_size = 1,
294 .max_tokens = IW_CUSTOM_MAX,
295 },
296 [IWEVREGISTERED - IWEVFIRST] = {
297 .header_type = IW_HEADER_TYPE_ADDR,
298 },
299 [IWEVEXPIRED - IWEVFIRST] = {
300 .header_type = IW_HEADER_TYPE_ADDR,
301 },
302};
303static const int standard_event_num = (sizeof(standard_event) /
304 sizeof(struct iw_ioctl_description));
305
306/* Size (in bytes) of the various private data types */
307static const char iw_priv_type_size[] = {
308 0, /* IW_PRIV_TYPE_NONE */
309 1, /* IW_PRIV_TYPE_BYTE */
310 1, /* IW_PRIV_TYPE_CHAR */
311 0, /* Not defined */
312 sizeof(__u32), /* IW_PRIV_TYPE_INT */
313 sizeof(struct iw_freq), /* IW_PRIV_TYPE_FLOAT */
314 sizeof(struct sockaddr), /* IW_PRIV_TYPE_ADDR */
315 0, /* Not defined */
316};
317
318/* Size (in bytes) of various events */
319static const int event_type_size[] = {
320 IW_EV_LCP_LEN, /* IW_HEADER_TYPE_NULL */
321 0,
322 IW_EV_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */
323 0,
324 IW_EV_UINT_LEN, /* IW_HEADER_TYPE_UINT */
325 IW_EV_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */
326 IW_EV_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */
327 0,
328 IW_EV_POINT_LEN, /* Without variable payload */
329 IW_EV_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */
330 IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */
331};
332
333/************************ COMMON SUBROUTINES ************************/
334/*
335 * Stuff that may be used in various places or doesn't fit in one
336 * of the sections below.
337 */
338
339/* ---------------------------------------------------------------- */
340/*
341 * Return the driver handler associated with a specific Wireless Extension.
342 * Called from various places, so make sure it remains efficient.
343 */
344static inline iw_handler get_handler(struct net_device *dev,
345 unsigned int cmd)
346{
347 /* Don't "optimise" the following variable, it will crash */
348 unsigned int index; /* *MUST* be unsigned */
349
350 /* Check if we have some wireless handlers defined */
351 if(dev->wireless_handlers == NULL)
352 return NULL;
353
354 /* Try as a standard command */
355 index = cmd - SIOCIWFIRST;
356 if(index < dev->wireless_handlers->num_standard)
357 return dev->wireless_handlers->standard[index];
358
359 /* Try as a private command */
360 index = cmd - SIOCIWFIRSTPRIV;
361 if(index < dev->wireless_handlers->num_private)
362 return dev->wireless_handlers->private[index];
363
364 /* Not found */
365 return NULL;
366}
367
368/* ---------------------------------------------------------------- */
369/*
370 * Get statistics out of the driver
371 */
372static inline struct iw_statistics *get_wireless_stats(struct net_device *dev)
373{
374 /* New location */
375 if((dev->wireless_handlers != NULL) &&
376 (dev->wireless_handlers->get_wireless_stats != NULL))
377 return dev->wireless_handlers->get_wireless_stats(dev);
378
379 /* Old location, will be phased out in next WE */
380 return (dev->get_wireless_stats ?
381 dev->get_wireless_stats(dev) :
382 (struct iw_statistics *) NULL);
383}
384
385/* ---------------------------------------------------------------- */
386/*
387 * Call the commit handler in the driver
388 * (if it exists and if conditions are right)
389 *
390 * Note : our commit strategy is currently pretty dumb,
391 * but we will be able to improve on that...
392 * The goal is to try to aggregate as many changes as possible
393 * before doing the commit. Drivers that will define a commit handler
394 * are usually those that need a reset after changing parameters, so
395 * we want to minimise the number of resets.
396 * A cool idea is to use a timer : at each "set" command, we re-set the
397 * timer, when the timer eventually fires, we call the driver.
398 * Hopefully, more on that later.
399 *
400 * Also, I'm waiting to see how many people will complain about the
401 * netif_running(dev) test. I'm open on that one...
402 * Hopefully, the driver will remember to do a commit in "open()" ;-)
403 */
404static inline int call_commit_handler(struct net_device * dev)
405{
406 if((netif_running(dev)) &&
407 (dev->wireless_handlers->standard[0] != NULL)) {
408 /* Call the commit handler on the driver */
409 return dev->wireless_handlers->standard[0](dev, NULL,
410 NULL, NULL);
411 } else
412 return 0; /* Command completed successfully */
413}
414
415/* ---------------------------------------------------------------- */
416/*
417 * Calculate size of private arguments
418 */
419static inline int get_priv_size(__u16 args)
420{
421 int num = args & IW_PRIV_SIZE_MASK;
422 int type = (args & IW_PRIV_TYPE_MASK) >> 12;
423
424 return num * iw_priv_type_size[type];
425}
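
A worked example of the size computation above, with a hypothetical descriptor: a private ioctl declared with set_args = IW_PRIV_TYPE_INT | 16 has a type field (the bits extracted by the >> 12 above) selecting entry 4 of iw_priv_type_size[], i.e. sizeof(__u32) = 4 bytes per token, and a size field of 16 tokens, so get_priv_size() returns 16 * 4 = 64 bytes. If IW_PRIV_SIZE_FIXED were also set and the result fit within IFNAMSIZ, ioctl_private_call() below would skip the kmalloc() and pass the data inline in the iwreq itself.
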
426
427/* ---------------------------------------------------------------- */
428/*
429 * Re-calculate the size of private arguments
430 */
431static inline int adjust_priv_size(__u16 args,
432 union iwreq_data * wrqu)
433{
434 int num = wrqu->data.length;
435 int max = args & IW_PRIV_SIZE_MASK;
436 int type = (args & IW_PRIV_TYPE_MASK) >> 12;
437
438 /* Make sure the driver doesn't goof up */
439 if (max < num)
440 num = max;
441
442 return num * iw_priv_type_size[type];
443}
444
445
446/******************** /proc/net/wireless SUPPORT ********************/
447/*
448 * The /proc/net/wireless file is a human readable user-space interface
449 * exporting various wireless specific statistics from the wireless devices.
450 * This is the most popular part of the Wireless Extensions ;-)
451 *
452 * This interface is a pure clone of /proc/net/dev (in net/core/dev.c).
453 * The content of the file is basically the content of "struct iw_statistics".
454 */
455
456#ifdef CONFIG_PROC_FS
457
458/* ---------------------------------------------------------------- */
459/*
460 * Print one entry (line) of /proc/net/wireless
461 */
462static __inline__ void wireless_seq_printf_stats(struct seq_file *seq,
463 struct net_device *dev)
464{
465 /* Get stats from the driver */
466 struct iw_statistics *stats = get_wireless_stats(dev);
467
468 if (stats) {
469 seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d "
470 "%6d %6d %6d\n",
471 dev->name, stats->status, stats->qual.qual,
472 stats->qual.updated & IW_QUAL_QUAL_UPDATED
473 ? '.' : ' ',
474 ((__u8) stats->qual.level),
475 stats->qual.updated & IW_QUAL_LEVEL_UPDATED
476 ? '.' : ' ',
477 ((__u8) stats->qual.noise),
478 stats->qual.updated & IW_QUAL_NOISE_UPDATED
479 ? '.' : ' ',
480 stats->discard.nwid, stats->discard.code,
481 stats->discard.fragment, stats->discard.retries,
482 stats->discard.misc, stats->miss.beacon);
483 stats->qual.updated = 0;
484 }
485}
486
487/* ---------------------------------------------------------------- */
488/*
489 * Print info for /proc/net/wireless (print all entries)
490 */
491static int wireless_seq_show(struct seq_file *seq, void *v)
492{
493 if (v == SEQ_START_TOKEN)
494 seq_printf(seq, "Inter-| sta-| Quality | Discarded "
495 "packets | Missed | WE\n"
496 " face | tus | link level noise | nwid "
497 "crypt frag retry misc | beacon | %d\n",
498 WIRELESS_EXT);
499 else
500 wireless_seq_printf_stats(seq, v);
501 return 0;
502}
503
504extern void *dev_seq_start(struct seq_file *seq, loff_t *pos);
505extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos);
506extern void dev_seq_stop(struct seq_file *seq, void *v);
507
508static struct seq_operations wireless_seq_ops = {
509 .start = dev_seq_start,
510 .next = dev_seq_next,
511 .stop = dev_seq_stop,
512 .show = wireless_seq_show,
513};
514
515static int wireless_seq_open(struct inode *inode, struct file *file)
516{
517 return seq_open(file, &wireless_seq_ops);
518}
519
520static struct file_operations wireless_seq_fops = {
521 .owner = THIS_MODULE,
522 .open = wireless_seq_open,
523 .read = seq_read,
524 .llseek = seq_lseek,
525 .release = seq_release,
526};
527
528int __init wireless_proc_init(void)
529{
530 if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops))
531 return -ENOMEM;
532
533 return 0;
534}
535#endif /* CONFIG_PROC_FS */
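
Putting the seq_printf() format above together, each wireless interface contributes one line to /proc/net/wireless after the two-line header. A hypothetical entry for a device named eth1 (all values invented, column spacing approximate) would look roughly like:

  eth1: 0000   54.  195.  164.       0      0      0      3      0        0

The columns are, in order: interface name, status, link quality, signal level and noise level (a trailing '.' means the corresponding IW_QUAL_*_UPDATED flag was set on this read, after which it is cleared), then the discard counters (nwid, crypt, frag, retry, misc) and missed beacons.
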
536
537/************************** IOCTL SUPPORT **************************/
538/*
539 * The original user space API to configure all those Wireless Extensions
540 * is through IOCTLs.
541 * In there, we check if we need to call the new driver API (iw_handler)
542 * or just call the driver ioctl handler.
543 */
544
545/* ---------------------------------------------------------------- */
546/*
547 * Allow programmatic access to /proc/net/wireless even if /proc
548 * doesn't exist... Also more efficient...
549 */
550static inline int dev_iwstats(struct net_device *dev, struct ifreq *ifr)
551{
552 /* Get stats from the driver */
553 struct iw_statistics *stats;
554
555 stats = get_wireless_stats(dev);
556 if (stats != (struct iw_statistics *) NULL) {
557 struct iwreq * wrq = (struct iwreq *)ifr;
558
559 /* Copy statistics to the user buffer */
560 if(copy_to_user(wrq->u.data.pointer, stats,
561 sizeof(struct iw_statistics)))
562 return -EFAULT;
563
564 /* Check if we need to clear the update flag */
565 if(wrq->u.data.flags != 0)
566 stats->qual.updated = 0;
567 return 0;
568 } else
569 return -EOPNOTSUPP;
570}
571
572/* ---------------------------------------------------------------- */
573/*
574 * Export the driver private handler definitions.
575 * They will be picked up by tools like iwpriv...
576 */
577static inline int ioctl_export_private(struct net_device * dev,
578 struct ifreq * ifr)
579{
580 struct iwreq * iwr = (struct iwreq *) ifr;
581
582 /* Check if the driver has something to export */
583 if((dev->wireless_handlers->num_private_args == 0) ||
584 (dev->wireless_handlers->private_args == NULL))
585 return -EOPNOTSUPP;
586
587 /* Check NULL pointer */
588 if(iwr->u.data.pointer == NULL)
589 return -EFAULT;
590
591 /* Check if there is enough buffer up there */
592 if(iwr->u.data.length < dev->wireless_handlers->num_private_args) {
593 /* User space can't know in advance how large the buffer
594 * needs to be. Give it a hint, so that we can support
595 * any size buffer we want somewhat efficiently... */
596 iwr->u.data.length = dev->wireless_handlers->num_private_args;
597 return -E2BIG;
598 }
599
600 /* Set the number of available ioctls. */
601 iwr->u.data.length = dev->wireless_handlers->num_private_args;
602
603 /* Copy structure to the user buffer. */
604 if (copy_to_user(iwr->u.data.pointer,
605 dev->wireless_handlers->private_args,
606 sizeof(struct iw_priv_args) * iwr->u.data.length))
607 return -EFAULT;
608
609 return 0;
610}
611
612/* ---------------------------------------------------------------- */
613/*
614 * Wrapper to call a standard Wireless Extension handler.
615 * We do various checks and also take care of moving data between
616 * user space and kernel space.
617 */
618static inline int ioctl_standard_call(struct net_device * dev,
619 struct ifreq * ifr,
620 unsigned int cmd,
621 iw_handler handler)
622{
623 struct iwreq * iwr = (struct iwreq *) ifr;
624 const struct iw_ioctl_description * descr;
625 struct iw_request_info info;
626 int ret = -EINVAL;
627
628 /* Get the description of the IOCTL */
629 if((cmd - SIOCIWFIRST) >= standard_ioctl_num)
630 return -EOPNOTSUPP;
631 descr = &(standard_ioctl[cmd - SIOCIWFIRST]);
632
633#ifdef WE_IOCTL_DEBUG
634 printk(KERN_DEBUG "%s (WE) : Found standard handler for 0x%04X\n",
635 ifr->ifr_name, cmd);
636 printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
637#endif /* WE_IOCTL_DEBUG */
638
639 /* Prepare the call */
640 info.cmd = cmd;
641 info.flags = 0;
642
643 /* Check if we have a pointer to user space data or not */
644 if(descr->header_type != IW_HEADER_TYPE_POINT) {
645
646 /* No extra arguments. Trivial to handle */
647 ret = handler(dev, &info, &(iwr->u), NULL);
648
649#ifdef WE_SET_EVENT
650 /* Generate an event to notify listeners of the change */
651 if((descr->flags & IW_DESCR_FLAG_EVENT) &&
652 ((ret == 0) || (ret == -EIWCOMMIT)))
653 wireless_send_event(dev, cmd, &(iwr->u), NULL);
654#endif /* WE_SET_EVENT */
655 } else {
656 char * extra;
657 int extra_size;
658 int user_length = 0;
659 int err;
660
661 /* Calculate space needed by arguments. Always allocate
662 * for max space. Easier, and won't last long... */
663 extra_size = descr->max_tokens * descr->token_size;
664
665 /* Check what user space is giving us */
666 if(IW_IS_SET(cmd)) {
667 /* Check NULL pointer */
668 if((iwr->u.data.pointer == NULL) &&
669 (iwr->u.data.length != 0))
670 return -EFAULT;
671 /* Check if number of token fits within bounds */
672 if(iwr->u.data.length > descr->max_tokens)
673 return -E2BIG;
674 if(iwr->u.data.length < descr->min_tokens)
675 return -EINVAL;
676 } else {
677 /* Check NULL pointer */
678 if(iwr->u.data.pointer == NULL)
679 return -EFAULT;
680 /* Save user space buffer size for checking */
681 user_length = iwr->u.data.length;
682
683 /* Don't check if user_length > max to allow forward
684 * compatibility. The test user_length < min is
685 * implied by the test at the end. */
686
687 /* Support for very large requests */
688 if((descr->flags & IW_DESCR_FLAG_NOMAX) &&
689 (user_length > descr->max_tokens)) {
690 /* Allow userspace to GET more than max so
691 * we can support any size GET requests.
692 * There is still a limit : -ENOMEM. */
693 extra_size = user_length * descr->token_size;
694 /* Note : user_length is originally a __u16,
695 * and token_size is controlled by us,
696 * so extra_size won't get negative and
697 * won't overflow... */
698 }
699 }
700
701#ifdef WE_IOCTL_DEBUG
702 printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n",
703 dev->name, extra_size);
704#endif /* WE_IOCTL_DEBUG */
705
706 /* Create the kernel buffer */
707 extra = kmalloc(extra_size, GFP_KERNEL);
708 if (extra == NULL) {
709 return -ENOMEM;
710 }
711
712 /* If it is a SET, get all the extra data in here */
713 if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) {
714 err = copy_from_user(extra, iwr->u.data.pointer,
715 iwr->u.data.length *
716 descr->token_size);
717 if (err) {
718 kfree(extra);
719 return -EFAULT;
720 }
721#ifdef WE_IOCTL_DEBUG
722 printk(KERN_DEBUG "%s (WE) : Got %d bytes\n",
723 dev->name,
724 iwr->u.data.length * descr->token_size);
725#endif /* WE_IOCTL_DEBUG */
726 }
727
728 /* Call the handler */
729 ret = handler(dev, &info, &(iwr->u), extra);
730
731 /* If we have something to return to the user */
732 if (!ret && IW_IS_GET(cmd)) {
733 /* Check if there is enough buffer up there */
734 if(user_length < iwr->u.data.length) {
735 kfree(extra);
736 return -E2BIG;
737 }
738
739 err = copy_to_user(iwr->u.data.pointer, extra,
740 iwr->u.data.length *
741 descr->token_size);
742 if (err)
743 ret = -EFAULT;
744#ifdef WE_IOCTL_DEBUG
745 printk(KERN_DEBUG "%s (WE) : Wrote %d bytes\n",
746 dev->name,
747 iwr->u.data.length * descr->token_size);
748#endif /* WE_IOCTL_DEBUG */
749 }
750
751#ifdef WE_SET_EVENT
752 /* Generate an event to notify listeners of the change */
753 if((descr->flags & IW_DESCR_FLAG_EVENT) &&
754 ((ret == 0) || (ret == -EIWCOMMIT))) {
755 if(descr->flags & IW_DESCR_FLAG_RESTRICT)
756 /* If the event is restricted, don't
757 * export the payload */
758 wireless_send_event(dev, cmd, &(iwr->u), NULL);
759 else
760 wireless_send_event(dev, cmd, &(iwr->u),
761 extra);
762 }
763#endif /* WE_SET_EVENT */
764
765 /* Cleanup - I told you it wasn't that long ;-) */
766 kfree(extra);
767 }
768
769 /* Call commit handler if needed and defined */
770 if(ret == -EIWCOMMIT)
771 ret = call_commit_handler(dev);
772
773 /* Here, we will generate the appropriate event if needed */
774
775 return ret;
776}
777
778/* ---------------------------------------------------------------- */
779/*
780 * Wrapper to call a private Wireless Extension handler.
781 * We do various checks and also take care of moving data between
782 * user space and kernel space.
783 * It's not as nice and slimline as the standard wrapper. The cause
784 * is struct iw_priv_args, which was not really designed for the
785 * job we are doing here.
786 *
787 * IMPORTANT : This function prevents setting and getting data on the same
788 * IOCTL and enforces the SET/GET convention. Not doing so would be
789 * far too hairy...
790 * If you need to set and get data at the same time, please don't use
791 * an iw_handler but process it in your ioctl handler (i.e. use the
792 * old driver API).
793 */
794static inline int ioctl_private_call(struct net_device * dev,
795 struct ifreq * ifr,
796 unsigned int cmd,
797 iw_handler handler)
798{
799 struct iwreq * iwr = (struct iwreq *) ifr;
800 const struct iw_priv_args * descr = NULL;
801 struct iw_request_info info;
802 int extra_size = 0;
803 int i;
804 int ret = -EINVAL;
805
806 /* Get the description of the IOCTL */
807 for(i = 0; i < dev->wireless_handlers->num_private_args; i++)
808 if(cmd == dev->wireless_handlers->private_args[i].cmd) {
809 descr = &(dev->wireless_handlers->private_args[i]);
810 break;
811 }
812
813#ifdef WE_IOCTL_DEBUG
814 printk(KERN_DEBUG "%s (WE) : Found private handler for 0x%04X\n",
815 ifr->ifr_name, cmd);
816 if(descr) {
817 printk(KERN_DEBUG "%s (WE) : Name %s, set %X, get %X\n",
818 dev->name, descr->name,
819 descr->set_args, descr->get_args);
820 }
821#endif /* WE_IOCTL_DEBUG */
822
823 /* Compute the size of the set/get arguments */
824 if(descr != NULL) {
825 if(IW_IS_SET(cmd)) {
826 int offset = 0; /* For sub-ioctls */
827 /* Check for sub-ioctl handler */
828 if(descr->name[0] == '\0')
829 /* Reserve one int for sub-ioctl index */
830 offset = sizeof(__u32);
831
832 /* Size of set arguments */
833 extra_size = get_priv_size(descr->set_args);
834
835			/* Does it fit in iwr ? */
836 if((descr->set_args & IW_PRIV_SIZE_FIXED) &&
837 ((extra_size + offset) <= IFNAMSIZ))
838 extra_size = 0;
839 } else {
840 /* Size of get arguments */
841 extra_size = get_priv_size(descr->get_args);
842
843			/* Does it fit in iwr ? */
844 if((descr->get_args & IW_PRIV_SIZE_FIXED) &&
845 (extra_size <= IFNAMSIZ))
846 extra_size = 0;
847 }
848 }
849
850 /* Prepare the call */
851 info.cmd = cmd;
852 info.flags = 0;
853
854 /* Check if we have a pointer to user space data or not. */
855 if(extra_size == 0) {
856 /* No extra arguments. Trivial to handle */
857 ret = handler(dev, &info, &(iwr->u), (char *) &(iwr->u));
858 } else {
859 char * extra;
860 int err;
861
862 /* Check what user space is giving us */
863 if(IW_IS_SET(cmd)) {
864 /* Check NULL pointer */
865 if((iwr->u.data.pointer == NULL) &&
866 (iwr->u.data.length != 0))
867 return -EFAULT;
868
869			/* Does it fit within bounds ? */
870 if(iwr->u.data.length > (descr->set_args &
871 IW_PRIV_SIZE_MASK))
872 return -E2BIG;
873 } else {
874 /* Check NULL pointer */
875 if(iwr->u.data.pointer == NULL)
876 return -EFAULT;
877 }
878
879#ifdef WE_IOCTL_DEBUG
880 printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n",
881 dev->name, extra_size);
882#endif /* WE_IOCTL_DEBUG */
883
884 /* Always allocate for max space. Easier, and won't last
885 * long... */
886 extra = kmalloc(extra_size, GFP_KERNEL);
887 if (extra == NULL) {
888 return -ENOMEM;
889 }
890
891 /* If it is a SET, get all the extra data in here */
892 if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) {
893 err = copy_from_user(extra, iwr->u.data.pointer,
894 extra_size);
895 if (err) {
896 kfree(extra);
897 return -EFAULT;
898 }
899#ifdef WE_IOCTL_DEBUG
900 printk(KERN_DEBUG "%s (WE) : Got %d elem\n",
901 dev->name, iwr->u.data.length);
902#endif /* WE_IOCTL_DEBUG */
903 }
904
905 /* Call the handler */
906 ret = handler(dev, &info, &(iwr->u), extra);
907
908 /* If we have something to return to the user */
909 if (!ret && IW_IS_GET(cmd)) {
910
911 /* Adjust for the actual length if it's variable,
912 * avoid leaking kernel bits outside. */
913 if (!(descr->get_args & IW_PRIV_SIZE_FIXED)) {
914 extra_size = adjust_priv_size(descr->get_args,
915 &(iwr->u));
916 }
917
918 err = copy_to_user(iwr->u.data.pointer, extra,
919 extra_size);
920 if (err)
921 ret = -EFAULT;
922#ifdef WE_IOCTL_DEBUG
923 printk(KERN_DEBUG "%s (WE) : Wrote %d elem\n",
924 dev->name, iwr->u.data.length);
925#endif /* WE_IOCTL_DEBUG */
926 }
927
928 /* Cleanup - I told you it wasn't that long ;-) */
929 kfree(extra);
930 }
931
932
933 /* Call commit handler if needed and defined */
934 if(ret == -EIWCOMMIT)
935 ret = call_commit_handler(dev);
936
937 return ret;
938}
939
940/* ---------------------------------------------------------------- */
941/*
942 * Main IOCTl dispatcher. Called from the main networking code
943 * (dev_ioctl() in net/core/dev.c).
944 * Check the type of IOCTL and call the appropriate wrapper...
945 */
946int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd)
947{
948 struct net_device *dev;
949 iw_handler handler;
950
951 /* Permissions are already checked in dev_ioctl() before calling us.
952 * The copy_to/from_user() of ifr is also dealt with in there */
953
954	/* Make sure the device exists */
955 if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
956 return -ENODEV;
957
958 /* A bunch of special cases, then the generic case...
959 * Note that 'cmd' is already filtered in dev_ioctl() with
960 * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */
961 switch(cmd)
962 {
963 case SIOCGIWSTATS:
964 /* Get Wireless Stats */
965 return dev_iwstats(dev, ifr);
966
967 case SIOCGIWPRIV:
968 /* Check if we have some wireless handlers defined */
969 if(dev->wireless_handlers != NULL) {
970 /* We export to user space the definition of
971 * the private handler ourselves */
972 return ioctl_export_private(dev, ifr);
973 }
974 // ## Fall-through for old API ##
975 default:
976 /* Generic IOCTL */
977 /* Basic check */
978 if (!netif_device_present(dev))
979 return -ENODEV;
980 /* New driver API : try to find the handler */
981 handler = get_handler(dev, cmd);
982 if(handler != NULL) {
983 /* Standard and private are not the same */
984 if(cmd < SIOCIWFIRSTPRIV)
985 return ioctl_standard_call(dev,
986 ifr,
987 cmd,
988 handler);
989 else
990 return ioctl_private_call(dev,
991 ifr,
992 cmd,
993 handler);
994 }
995 /* Old driver API : call driver ioctl handler */
996 if (dev->do_ioctl) {
997 return dev->do_ioctl(dev, ifr, cmd);
998 }
999 return -EOPNOTSUPP;
1000 }
1001 /* Not reached */
1002 return -EINVAL;
1003}
1004
1005/************************* EVENT PROCESSING *************************/
1006/*
1007 * Process events generated by the wireless layer or the driver.
1008 * Most often, the event will be propagated through rtnetlink
1009 */
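
For reference, the driver side of this dispatcher is a single call. A hypothetical driver reporting a custom event (IWEVCUSTOM, declared above with IW_HEADER_TYPE_POINT and a token size of one byte) would do something like:

	union iwreq_data wrqu;
	char msg[] = "assoc: hypothetical driver event";

	memset(&wrqu, 0, sizeof(wrqu));
	wrqu.data.length = strlen(msg);
	wireless_send_event(dev, IWEVCUSTOM, &wrqu, msg);
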
1010
1011#ifdef WE_EVENT_NETLINK
1012/* "rtnl" is defined in net/core/rtnetlink.c, but we need it here.
1013 * It is declared in <linux/rtnetlink.h> */
1014
1015/* ---------------------------------------------------------------- */
1016/*
1017 * Fill an rtnetlink message with our event data.
1018 * Note that we propagate only the specified event and don't dump the
1019 * current wireless config. Dumping the wireless config is far too
1020 * expensive (for each parameter, the driver needs to query the hardware).
1021 */
1022static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
1023 struct net_device * dev,
1024 int type,
1025 char * event,
1026 int event_len)
1027{
1028 struct ifinfomsg *r;
1029 struct nlmsghdr *nlh;
1030 unsigned char *b = skb->tail;
1031
1032 nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
1033 r = NLMSG_DATA(nlh);
1034 r->ifi_family = AF_UNSPEC;
1035 r->ifi_type = dev->type;
1036 r->ifi_index = dev->ifindex;
1037 r->ifi_flags = dev->flags;
1038 r->ifi_change = 0; /* Wireless changes don't affect those flags */
1039
1040 /* Add the wireless events in the netlink packet */
1041 RTA_PUT(skb, IFLA_WIRELESS,
1042 event_len, event);
1043
1044 nlh->nlmsg_len = skb->tail - b;
1045 return skb->len;
1046
1047nlmsg_failure:
1048rtattr_failure:
1049 skb_trim(skb, b - skb->data);
1050 return -1;
1051}
1052
1053/* ---------------------------------------------------------------- */
1054/*
1055 * Create an event message and broadcast it on the standard rtnetlink socket.
1056 * This is a pure clone of rtmsg_ifinfo() in net/core/rtnetlink.c.
1057 * Andrzej Krzysztofowicz mandated that I use an IFLA_XXX field
1058 * within a RTM_NEWLINK event.
1059 */
1060static inline void rtmsg_iwinfo(struct net_device * dev,
1061 char * event,
1062 int event_len)
1063{
1064 struct sk_buff *skb;
1065 int size = NLMSG_GOODSIZE;
1066
1067 skb = alloc_skb(size, GFP_ATOMIC);
1068 if (!skb)
1069 return;
1070
1071 if (rtnetlink_fill_iwinfo(skb, dev, RTM_NEWLINK,
1072 event, event_len) < 0) {
1073 kfree_skb(skb);
1074 return;
1075 }
1076 NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
1077 netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC);
1078}
1079#endif /* WE_EVENT_NETLINK */
1080
1081/* ---------------------------------------------------------------- */
1082/*
1083 * Main event dispatcher. Called from other parts and drivers.
1084 * Send the event on the appropriate channels.
1085 * May be called from interrupt context.
1086 */
1087void wireless_send_event(struct net_device * dev,
1088 unsigned int cmd,
1089 union iwreq_data * wrqu,
1090 char * extra)
1091{
1092 const struct iw_ioctl_description * descr = NULL;
1093 int extra_len = 0;
1094 struct iw_event *event; /* Mallocated whole event */
1095 int event_len; /* Its size */
1096 int hdr_len; /* Size of the event header */
1097 /* Don't "optimise" the following variable, it will crash */
1098 unsigned cmd_index; /* *MUST* be unsigned */
1099
1100 /* Get the description of the IOCTL */
1101 if(cmd <= SIOCIWLAST) {
1102 cmd_index = cmd - SIOCIWFIRST;
1103 if(cmd_index < standard_ioctl_num)
1104 descr = &(standard_ioctl[cmd_index]);
1105 } else {
1106 cmd_index = cmd - IWEVFIRST;
1107 if(cmd_index < standard_event_num)
1108 descr = &(standard_event[cmd_index]);
1109 }
1110 /* Don't accept unknown events */
1111 if(descr == NULL) {
1112 /* Note : we don't return an error to the driver, because
1113 * the driver would not know what to do about it. It can't
1114 * return an error to the user, because the event is not
1115 * initiated by a user request.
1116 * The best the driver could do is to log an error message.
1117 * We will do it ourselves instead...
1118 */
1119 printk(KERN_ERR "%s (WE) : Invalid/Unknown Wireless Event (0x%04X)\n",
1120 dev->name, cmd);
1121 return;
1122 }
1123#ifdef WE_EVENT_DEBUG
1124 printk(KERN_DEBUG "%s (WE) : Got event 0x%04X\n",
1125 dev->name, cmd);
1126 printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
1127#endif /* WE_EVENT_DEBUG */
1128
1129 /* Check extra parameters and set extra_len */
1130 if(descr->header_type == IW_HEADER_TYPE_POINT) {
1131 /* Check if number of token fits within bounds */
1132 if(wrqu->data.length > descr->max_tokens) {
1133 printk(KERN_ERR "%s (WE) : Wireless Event too big (%d)\n", dev->name, wrqu->data.length);
1134 return;
1135 }
1136 if(wrqu->data.length < descr->min_tokens) {
1137 printk(KERN_ERR "%s (WE) : Wireless Event too small (%d)\n", dev->name, wrqu->data.length);
1138 return;
1139 }
1140 /* Calculate extra_len - extra is NULL for restricted events */
1141 if(extra != NULL)
1142 extra_len = wrqu->data.length * descr->token_size;
1143#ifdef WE_EVENT_DEBUG
1144 printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len);
1145#endif /* WE_EVENT_DEBUG */
1146 }
1147
1148 /* Total length of the event */
1149 hdr_len = event_type_size[descr->header_type];
1150 event_len = hdr_len + extra_len;
1151
1152#ifdef WE_EVENT_DEBUG
1153 printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, event_len %d\n", dev->name, cmd, hdr_len, event_len);
1154#endif /* WE_EVENT_DEBUG */
1155
1156 /* Create temporary buffer to hold the event */
1157 event = kmalloc(event_len, GFP_ATOMIC);
1158 if(event == NULL)
1159 return;
1160
1161 /* Fill event */
1162 event->len = event_len;
1163 event->cmd = cmd;
1164 memcpy(&event->u, wrqu, hdr_len - IW_EV_LCP_LEN);
1165 if(extra != NULL)
1166 memcpy(((char *) event) + hdr_len, extra, extra_len);
1167
1168#ifdef WE_EVENT_NETLINK
1169 /* rtnetlink event channel */
1170 rtmsg_iwinfo(dev, (char *) event, event_len);
1171#endif /* WE_EVENT_NETLINK */
1172
1173 /* Cleanup */
1174 kfree(event);
1175
1176 return; /* Always success, I guess ;-) */
1177}
1178
1179/********************** ENHANCED IWSPY SUPPORT **********************/
1180/*
1181 * In the old days, the driver was handling spy support all by itself.
1182 * Now, the driver can delegate this task to Wireless Extensions.
1183 * It needs to use those standard spy iw_handlers in struct iw_handler_def,
1184 * push data to us via wireless_spy_update() and include struct iw_spy_data
1185 * in its private part (and advertise it in iw_handler_def->spy_offset).
1186 * One of the main advantages of centralising spy support here is that
1187 * it becomes much easier to improve and extend it without having to touch
1188 * the drivers. One example is the addition of the Spy-Threshold events.
1189 */
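
A minimal sketch of the driver side, under the "new way" described above (the helper name and the quality arithmetic are hypothetical; the only entry points used are struct iw_quality, the IW_QUAL_*_UPDATED flags and wireless_spy_update(), all of which appear elsewhere in this file):

/* Hypothetical RX-path helper in a driver that has wired up
 * dev->wireless_data->spy_data (the layout get_spydata() below expects). */
static void example_rx_spy(struct net_device *dev, unsigned char *src_mac,
			   unsigned char level, unsigned char noise)
{
	struct iw_quality wstats;

	wstats.level   = level;
	wstats.noise   = noise;
	wstats.qual    = (level > noise) ? (level - noise) : 0;
	wstats.updated = IW_QUAL_QUAL_UPDATED | IW_QUAL_LEVEL_UPDATED |
			 IW_QUAL_NOISE_UPDATED;

	wireless_spy_update(dev, src_mac, &wstats);
}
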
1190
1191/* ---------------------------------------------------------------- */
1192/*
1193 * Return the pointer to the spy data in the driver.
1194 * Because this is called on the Rx path via wireless_spy_update(),
1195 * we want it to be efficient...
1196 */
1197static inline struct iw_spy_data * get_spydata(struct net_device *dev)
1198{
1199 /* This is the new way */
1200 if(dev->wireless_data)
1201 return(dev->wireless_data->spy_data);
1202
1203 /* This is the old way. Doesn't work for multi-headed drivers.
1204 * It will be removed in the next version of WE. */
1205 return (dev->priv + dev->wireless_handlers->spy_offset);
1206}
1207
1208/*------------------------------------------------------------------*/
1209/*
1210 * Standard Wireless Handler : set Spy List
1211 */
1212int iw_handler_set_spy(struct net_device * dev,
1213 struct iw_request_info * info,
1214 union iwreq_data * wrqu,
1215 char * extra)
1216{
1217 struct iw_spy_data * spydata = get_spydata(dev);
1218 struct sockaddr * address = (struct sockaddr *) extra;
1219
1220 if(!dev->wireless_data)
1221 /* Help user know that driver needs updating */
1222 printk(KERN_DEBUG "%s (WE) : Driver using old/buggy spy support, please fix driver !\n",
1223 dev->name);
1224 /* Make sure driver is not buggy or using the old API */
1225 if(!spydata)
1226 return -EOPNOTSUPP;
1227
1228 /* Disable spy collection while we copy the addresses.
1229 * While we copy addresses, any call to wireless_spy_update()
1230 * will NOP. This is OK, as anyway the addresses are changing. */
1231 spydata->spy_number = 0;
1232
1233 /* We want to operate without locking, because wireless_spy_update()
1234 * most likely will happen in the interrupt handler, and therefore
1235	 * has its own locking constraints and needs performance.
1236	 * The rtnl_lock() makes sure we don't race with the other iw_handlers.
1237	 * This makes sure wireless_spy_update() "sees" that the spy list
1238 * is temporarily disabled. */
1239 wmb();
1240
1241	/* Are there addresses to copy? */
1242 if(wrqu->data.length > 0) {
1243 int i;
1244
1245 /* Copy addresses */
1246 for(i = 0; i < wrqu->data.length; i++)
1247 memcpy(spydata->spy_address[i], address[i].sa_data,
1248 ETH_ALEN);
1249 /* Reset stats */
1250 memset(spydata->spy_stat, 0,
1251 sizeof(struct iw_quality) * IW_MAX_SPY);
1252
1253#ifdef WE_SPY_DEBUG
1254 printk(KERN_DEBUG "iw_handler_set_spy() : offset %ld, spydata %p, num %d\n", dev->wireless_handlers->spy_offset, spydata, wrqu->data.length);
1255 for (i = 0; i < wrqu->data.length; i++)
1256 printk(KERN_DEBUG
1257 "%02X:%02X:%02X:%02X:%02X:%02X \n",
1258 spydata->spy_address[i][0],
1259 spydata->spy_address[i][1],
1260 spydata->spy_address[i][2],
1261 spydata->spy_address[i][3],
1262 spydata->spy_address[i][4],
1263 spydata->spy_address[i][5]);
1264#endif /* WE_SPY_DEBUG */
1265 }
1266
1267 /* Make sure above is updated before re-enabling */
1268 wmb();
1269
1270 /* Enable addresses */
1271 spydata->spy_number = wrqu->data.length;
1272
1273 return 0;
1274}
1275
1276/*------------------------------------------------------------------*/
1277/*
1278 * Standard Wireless Handler : get Spy List
1279 */
1280int iw_handler_get_spy(struct net_device * dev,
1281 struct iw_request_info * info,
1282 union iwreq_data * wrqu,
1283 char * extra)
1284{
1285 struct iw_spy_data * spydata = get_spydata(dev);
1286 struct sockaddr * address = (struct sockaddr *) extra;
1287 int i;
1288
1289 /* Make sure driver is not buggy or using the old API */
1290 if(!spydata)
1291 return -EOPNOTSUPP;
1292
1293 wrqu->data.length = spydata->spy_number;
1294
1295 /* Copy addresses. */
1296 for(i = 0; i < spydata->spy_number; i++) {
1297 memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN);
1298 address[i].sa_family = AF_UNIX;
1299 }
1300 /* Copy stats to the user buffer (just after). */
1301 if(spydata->spy_number > 0)
1302 memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number),
1303 spydata->spy_stat,
1304 sizeof(struct iw_quality) * spydata->spy_number);
1305 /* Reset updated flags. */
1306 for(i = 0; i < spydata->spy_number; i++)
1307 spydata->spy_stat[i].updated = 0;
1308 return 0;
1309}
1310
1311/*------------------------------------------------------------------*/
1312/*
1313 * Standard Wireless Handler : set spy threshold
1314 */
1315int iw_handler_set_thrspy(struct net_device * dev,
1316 struct iw_request_info *info,
1317 union iwreq_data * wrqu,
1318 char * extra)
1319{
1320 struct iw_spy_data * spydata = get_spydata(dev);
1321 struct iw_thrspy * threshold = (struct iw_thrspy *) extra;
1322
1323 /* Make sure driver is not buggy or using the old API */
1324 if(!spydata)
1325 return -EOPNOTSUPP;
1326
1327 /* Just do it */
1328 memcpy(&(spydata->spy_thr_low), &(threshold->low),
1329 2 * sizeof(struct iw_quality));
1330
1331 /* Clear flag */
1332 memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under));
1333
1334#ifdef WE_SPY_DEBUG
1335 printk(KERN_DEBUG "iw_handler_set_thrspy() : low %d ; high %d\n", spydata->spy_thr_low.level, spydata->spy_thr_high.level);
1336#endif /* WE_SPY_DEBUG */
1337
1338 return 0;
1339}
1340
1341/*------------------------------------------------------------------*/
1342/*
1343 * Standard Wireless Handler : get spy threshold
1344 */
1345int iw_handler_get_thrspy(struct net_device * dev,
1346 struct iw_request_info *info,
1347 union iwreq_data * wrqu,
1348 char * extra)
1349{
1350 struct iw_spy_data * spydata = get_spydata(dev);
1351 struct iw_thrspy * threshold = (struct iw_thrspy *) extra;
1352
1353 /* Make sure driver is not buggy or using the old API */
1354 if(!spydata)
1355 return -EOPNOTSUPP;
1356
1357 /* Just do it */
1358 memcpy(&(threshold->low), &(spydata->spy_thr_low),
1359 2 * sizeof(struct iw_quality));
1360
1361 return 0;
1362}
1363
1364/*------------------------------------------------------------------*/
1365/*
1366 * Prepare and send a Spy Threshold event
1367 */
1368static void iw_send_thrspy_event(struct net_device * dev,
1369 struct iw_spy_data * spydata,
1370 unsigned char * address,
1371 struct iw_quality * wstats)
1372{
1373 union iwreq_data wrqu;
1374 struct iw_thrspy threshold;
1375
1376 /* Init */
1377 wrqu.data.length = 1;
1378 wrqu.data.flags = 0;
1379 /* Copy address */
1380 memcpy(threshold.addr.sa_data, address, ETH_ALEN);
1381 threshold.addr.sa_family = ARPHRD_ETHER;
1382 /* Copy stats */
1383 memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality));
1384 /* Copy also thresholds */
1385 memcpy(&(threshold.low), &(spydata->spy_thr_low),
1386 2 * sizeof(struct iw_quality));
1387
1388#ifdef WE_SPY_DEBUG
1389 printk(KERN_DEBUG "iw_send_thrspy_event() : address %02X:%02X:%02X:%02X:%02X:%02X, level %d, up = %d\n",
1390 threshold.addr.sa_data[0],
1391 threshold.addr.sa_data[1],
1392 threshold.addr.sa_data[2],
1393 threshold.addr.sa_data[3],
1394 threshold.addr.sa_data[4],
1395 threshold.addr.sa_data[5], threshold.qual.level);
1396#endif /* WE_SPY_DEBUG */
1397
1398 /* Send event to user space */
1399 wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold);
1400}
1401
1402/* ---------------------------------------------------------------- */
1403/*
1404 * Call for the driver to update the spy data.
1405 * For now, the spy data is a simple array. As the size of the array is
1406 * small, this is good enough. If we wanted to support a larger number of
1407 * spy addresses, we should use something more efficient...
1408 */
1409void wireless_spy_update(struct net_device * dev,
1410 unsigned char * address,
1411 struct iw_quality * wstats)
1412{
1413 struct iw_spy_data * spydata = get_spydata(dev);
1414 int i;
1415 int match = -1;
1416
1417 /* Make sure driver is not buggy or using the old API */
1418 if(!spydata)
1419 return;
1420
1421#ifdef WE_SPY_DEBUG
1422 printk(KERN_DEBUG "wireless_spy_update() : offset %ld, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_handlers->spy_offset, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
1423#endif /* WE_SPY_DEBUG */
1424
1425 /* Update all records that match */
1426 for(i = 0; i < spydata->spy_number; i++)
1427 if(!memcmp(address, spydata->spy_address[i], ETH_ALEN)) {
1428 memcpy(&(spydata->spy_stat[i]), wstats,
1429 sizeof(struct iw_quality));
1430 match = i;
1431 }
1432
1433 /* Generate an event if we cross the spy threshold.
1434 * To avoid event storms, we have a simple hysteresis : we generate
1435 * event only when we go under the low threshold or above the
1436 * high threshold. */
1437 if(match >= 0) {
1438 if(spydata->spy_thr_under[match]) {
1439 if(wstats->level > spydata->spy_thr_high.level) {
1440 spydata->spy_thr_under[match] = 0;
1441 iw_send_thrspy_event(dev, spydata,
1442 address, wstats);
1443 }
1444 } else {
1445 if(wstats->level < spydata->spy_thr_low.level) {
1446 spydata->spy_thr_under[match] = 1;
1447 iw_send_thrspy_event(dev, spydata,
1448 address, wstats);
1449 }
1450 }
1451 }
1452}
1453
1454EXPORT_SYMBOL(iw_handler_get_spy);
1455EXPORT_SYMBOL(iw_handler_get_thrspy);
1456EXPORT_SYMBOL(iw_handler_set_spy);
1457EXPORT_SYMBOL(iw_handler_set_thrspy);
1458EXPORT_SYMBOL(wireless_send_event);
1459EXPORT_SYMBOL(wireless_spy_update);