author	Johann Baudy <johann.baudy@gnu-log.net>	2009-05-19 01:11:22 -0400
committer	David S. Miller <davem@davemloft.net>	2009-05-19 01:11:22 -0400
commit	69e3c75f4d541a6eb151b3ef91f34033cb3ad6e1 (patch)
tree	24920f17ea435627978af9d5fe0e99763bf6a533
parent	f67f34084914144de55c785163d047d5d8dddd2d (diff)
net: TX_RING and packet mmap
New packet socket feature that makes packet sockets more efficient for
transmission.

 - It reduces the number of system calls through a PACKET_TX_RING
   mechanism, based on PACKET_RX_RING (a circular buffer allocated in
   kernel space which is mmap()ed from user space).

 - It minimizes CPU copies by using fragmented SKBs (almost zero copy).

Signed-off-by: Johann Baudy <johann.baudy@gnu-log.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--	Documentation/networking/packet_mmap.txt	| 140
-rw-r--r--	include/linux/if_packet.h	|  20
-rw-r--r--	include/linux/skbuff.h	|   3
-rw-r--r--	net/packet/af_packet.c	| 588
4 files changed, 616 insertions(+), 135 deletions(-)
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 07c53d596035..a22fd85e3796 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -4,16 +4,18 @@
 
 This file documents the CONFIG_PACKET_MMAP option available with the PACKET
 socket interface on 2.4 and 2.6 kernels. This type of socket is used to
-capture network traffic with utilities like tcpdump or any other that uses
-the libpcap library.
-
-You can find the latest version of this document at
+capture network traffic with utilities like tcpdump or any other that needs
+raw access to a network interface.
 
+You can find the latest version of this document at:
     http://pusa.uv.es/~ulisses/packet_mmap/
 
-Please send me your comments to
+A howto can be found at:
+    http://wiki.gnu-log.net (packet_mmap)
+
+Please send your comments to
     Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es>
+    Johann Baudy <johann.baudy@gnu-log.net>
 
 -------------------------------------------------------------------------------
 + Why use PACKET_MMAP
@@ -25,19 +27,24 @@ to capture each packet, it requires two if you want to get the packet's
 timestamp (like libpcap always does).
 
 On the other hand, PACKET_MMAP is very efficient. PACKET_MMAP provides a size-
-configurable circular buffer mapped in user space. This way reading packets just
-needs to wait for them, most of the time there is no need to issue a single
-system call. By using a shared buffer between the kernel and the user
-also has the benefit of minimizing packet copies.
-
-It's fine to use PACKET_MMAP to improve the performance of the capture process,
-but it isn't everything. At least, if you are capturing at high speeds (this
-is relative to the cpu speed), you should check if the device driver of your
-network interface card supports some sort of interrupt load mitigation or
-(even better) if it supports NAPI, also make sure it is enabled.
+configurable circular buffer mapped in user space that can be used to either
+send or receive packets. This way reading packets just needs to wait for them;
+most of the time there is no need to issue a single system call. Concerning
+transmission, multiple packets can be sent through one system call to get the
+highest bandwidth.
+Using a shared buffer between the kernel and the user also has the benefit
+of minimizing packet copies.
+
+It's fine to use PACKET_MMAP to improve the performance of the capture and
+transmission process, but it isn't everything. At least, if you are capturing
+at high speeds (this is relative to the cpu speed), you should check if the
+device driver of your network interface card supports some sort of interrupt
+load mitigation or (even better) if it supports NAPI; also make sure it is
+enabled. For transmission, check the MTU (Maximum Transmission Unit) used and
+supported by the devices of your network.
 
 --------------------------------------------------------------------------------
-+ How to use CONFIG_PACKET_MMAP
++ How to use CONFIG_PACKET_MMAP to improve capture process
 --------------------------------------------------------------------------------
 
 From the user standpoint, you should use the higher level libpcap library, which
@@ -57,7 +64,7 @@ the low level details or want to improve libpcap by including PACKET_MMAP
 support.
 
 --------------------------------------------------------------------------------
-+ How to use CONFIG_PACKET_MMAP directly
++ How to use CONFIG_PACKET_MMAP directly to improve capture process
 --------------------------------------------------------------------------------
 
 From the system call standpoint, the use of PACKET_MMAP involves
@@ -66,6 +73,7 @@ the following process:
 
 [setup]     socket() -------> creation of the capture socket
             setsockopt() ---> allocation of the circular buffer (ring)
+                              option: PACKET_RX_RING
             mmap() ---------> mapping of the allocated buffer to the
                               user process
 
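For a concrete starting point, a minimal sketch of this capture-side setup
sequence follows. The ring geometry (4 KiB blocks, 2 KiB frames, 64 blocks)
and the helper name setup_rx_ring() are illustrative assumptions, not
requirements of the interface:

    /* Minimal capture setup sketch: socket() + PACKET_RX_RING + mmap(). */
    #include <sys/socket.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <arpa/inet.h>
    #include <linux/if_packet.h>
    #include <linux/if_ether.h>

    static int setup_rx_ring(void)
    {
            struct tpacket_req req;
            void *ring;
            int fd;

            fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
            if (fd < 0)
                    return -1;

            req.tp_block_size = 4096;   /* must be a multiple of PAGE_SIZE */
            req.tp_frame_size = 2048;   /* multiple of TPACKET_ALIGNMENT */
            req.tp_block_nr   = 64;
            req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size)
                                * req.tp_block_nr;

            if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
                           (void *) &req, sizeof(req)) < 0) {
                    close(fd);
                    return -1;
            }

            /* map the whole ring with a single call */
            ring = mmap(NULL, (size_t) req.tp_block_size * req.tp_block_nr,
                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (ring == MAP_FAILED) {
                    close(fd);
                    return -1;
            }
            return fd;
    }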
@@ -97,13 +105,75 @@ also the mapping of the circular buffer in the user process and
 the use of this buffer.
 
 --------------------------------------------------------------------------------
++ How to use CONFIG_PACKET_MMAP directly to improve transmission process
+--------------------------------------------------------------------------------
+The transmission process is similar to capture, as shown below.
+
+[setup]          socket() -------> creation of the transmission socket
+                 setsockopt() ---> allocation of the circular buffer (ring)
+                                   option: PACKET_TX_RING
+                 bind() ---------> bind the transmission socket to a network
+                                   interface
+                 mmap() ---------> mapping of the allocated buffer to the
+                                   user process
+
+[transmission]   poll() ---------> wait for free packets (optional)
+                 send() ---------> send all packets that are set as ready in
+                                   the ring
+                                   The flag MSG_DONTWAIT can be used to return
+                                   before the end of the transfer.
+
+[shutdown]  close() --------> destruction of the transmission socket and
+                              deallocation of all associated resources.
+
+Binding the socket to your network interface is mandatory (with zero copy) to
+know the header size of the frames used in the circular buffer.
+
+As with capture, each frame contains two parts:
+
+ --------------------
+| struct tpacket_hdr | Header. It contains the status of
+|                    | this frame
+|--------------------|
+| data buffer        |
+.                    .  Data that will be sent over the network interface.
+.                    .
+ --------------------
+
+ bind() associates the socket with your network interface thanks to the
+ sll_ifindex parameter of struct sockaddr_ll.
+
+ Initialization example:
+
+ struct sockaddr_ll my_addr;
+ struct ifreq s_ifr;
+ ...
+
+ strncpy(s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name));
+
+ /* get interface index of eth0 */
+ ioctl(this->socket, SIOCGIFINDEX, &s_ifr);
+
+ /* fill sockaddr_ll struct to prepare binding */
+ my_addr.sll_family = AF_PACKET;
+ my_addr.sll_protocol = htons(ETH_P_ALL);
+ my_addr.sll_ifindex = s_ifr.ifr_ifindex;
+
+ /* bind socket to eth0 */
+ bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll));
+
+ A complete tutorial is available at: http://wiki.gnu-log.net/
+
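Putting the whole transmission setup together, a minimal end-to-end sketch
might look as follows. The ring geometry, the interface name "eth0", and the
helper name setup_tx_ring() are illustrative assumptions:

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <arpa/inet.h>
    #include <linux/if_packet.h>
    #include <linux/if_ether.h>
    #include <net/if.h>

    static int setup_tx_ring(void)
    {
            struct sockaddr_ll my_addr;
            struct tpacket_req req;
            struct ifreq s_ifr;
            void *ring;
            int fd;

            fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
            if (fd < 0)
                    return -1;

            /* illustrative geometry, same constraints as the RX ring */
            memset(&req, 0, sizeof(req));
            req.tp_block_size = 4096;
            req.tp_frame_size = 2048;
            req.tp_block_nr   = 64;
            req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size)
                                * req.tp_block_nr;
            if (setsockopt(fd, SOL_PACKET, PACKET_TX_RING,
                           (void *) &req, sizeof(req)) < 0)
                    goto err;

            /* bind to the interface, as described above */
            memset(&s_ifr, 0, sizeof(s_ifr));
            strncpy(s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name));
            if (ioctl(fd, SIOCGIFINDEX, &s_ifr) < 0)
                    goto err;

            memset(&my_addr, 0, sizeof(my_addr));
            my_addr.sll_family   = AF_PACKET;
            my_addr.sll_protocol = htons(ETH_P_ALL);
            my_addr.sll_ifindex  = s_ifr.ifr_ifindex;
            if (bind(fd, (struct sockaddr *)&my_addr, sizeof(my_addr)) < 0)
                    goto err;

            ring = mmap(NULL, (size_t) req.tp_block_size * req.tp_block_nr,
                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (ring == MAP_FAILED)
                    goto err;
            return fd;
    err:
            close(fd);
            return -1;
    }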
+--------------------------------------------------------------------------------
 + PACKET_MMAP settings
 --------------------------------------------------------------------------------
 
 
 Setting up PACKET_MMAP from user level code is done with a call like
 
+ - Capture process
     setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req))
+ - Transmission process
+    setsockopt(fd, SOL_PACKET, PACKET_TX_RING, (void *) &req, sizeof(req))
 
 The most significant argument in the previous call is the req parameter;
 this parameter must have the following structure:
@@ -117,11 +187,11 @@ this parameter must have the following structure:
 };
 
 This structure is defined in /usr/include/linux/if_packet.h and establishes a
-circular buffer (ring) of unswappable memory mapped in the capture process.
+circular buffer (ring) of unswappable memory.
 Being mapped in the capture process allows reading the captured frames and
 related meta-information like timestamps without requiring a system call.
 
-Captured frames are grouped in blocks. Each block is a physically contiguous
+Frames are grouped in blocks. Each block is a physically contiguous
 region of memory and holds tp_block_size/tp_frame_size frames. The total number
 of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because
 
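As a worked instance of that redundancy (the values are only an example):
with the settings below, 4096/2048 = 2 frames fit in each block, so
tp_frame_nr must be 2 * 64 = 128, which is exactly the consistency the
kernel checks for. fill_ring_geometry() is a hypothetical helper name.

    #include <linux/if_packet.h>

    /* tp_frame_nr is implied by the other three fields; the kernel
     * rejects the request if they are inconsistent. */
    static void fill_ring_geometry(struct tpacket_req *req)
    {
            req->tp_block_size = 4096;   /* multiple of PAGE_SIZE */
            req->tp_frame_size = 2048;   /* multiple of TPACKET_ALIGNMENT */
            req->tp_block_nr   = 64;
            req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size)
                                 * req->tp_block_nr;   /* 2 * 64 = 128 */
    }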
@@ -336,6 +406,7 @@ struct tpacket_hdr). If this field is 0 it means that the frame is ready
 to be used by the kernel. If not, there is a frame the user can read,
 and the following flags apply:
 
++++ Capture process:
 from include/linux/if_packet.h
 
     #define TP_STATUS_COPY          2
@@ -391,6 +462,37 @@ packets are in the ring:
 It doesn't incur a race condition to first check the status value and
 then poll for frames.
 
+
+++ Transmission process
+These defines are also used for transmission:
+
+     #define TP_STATUS_AVAILABLE        0 // Frame is available
+     #define TP_STATUS_SEND_REQUEST     1 // Frame will be sent on next send()
+     #define TP_STATUS_SENDING          2 // Frame is currently in transmission
+     #define TP_STATUS_WRONG_FORMAT     4 // Frame format is not correct
+
+First, the kernel initializes all frames to TP_STATUS_AVAILABLE. To send a
+packet, the user fills a data buffer of an available frame, sets tp_len to
+the current data buffer size and sets its status to TP_STATUS_SEND_REQUEST.
+This can be done on multiple frames. Once the user is ready to transmit, it
+calls send(). Then all buffers with a status equal to TP_STATUS_SEND_REQUEST
+are forwarded to the network device. The kernel updates the status of each
+sent frame to TP_STATUS_SENDING until the end of transfer.
+At the end of each transfer, the buffer status returns to TP_STATUS_AVAILABLE.
+
+     header->tp_len = in_i_size;
+     header->tp_status = TP_STATUS_SEND_REQUEST;
+     retval = send(this->socket, NULL, 0, 0);
+
+The user can also use poll() to check if a buffer is available
+(while frames are still marked TP_STATUS_SENDING):
+
+     struct pollfd pfd;
+     pfd.fd = fd;
+     pfd.revents = 0;
+     pfd.events = POLLOUT;
+     retval = poll(&pfd, 1, timeout);
+
 --------------------------------------------------------------------------------
 + THANKS
 --------------------------------------------------------------------------------
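Tying the status values together, a send-loop sketch for a TPACKET_V1 ring is
shown below. It assumes frames are laid out contiguously in the mapping (true
when tp_block_size is a multiple of tp_frame_size, as in the setup sketches
above); queue_and_send() is a hypothetical helper name, and a production
implementation may need a compiler/memory barrier before the status store.

    #include <poll.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <linux/if_packet.h>

    /* Queue one packet in TX ring slot i, then flush the ring. */
    static int queue_and_send(int fd, char *ring, unsigned int i,
                              unsigned int frame_size,
                              const void *pkt, unsigned int len)
    {
            struct tpacket_hdr *hdr =
                    (struct tpacket_hdr *) (ring + i * frame_size);
            /* V1 data area starts right after the aligned header */
            char *data = (char *) hdr
                         + TPACKET_HDRLEN - sizeof(struct sockaddr_ll);

            if (hdr->tp_status != TP_STATUS_AVAILABLE) {
                    /* wait until the kernel releases a frame */
                    struct pollfd pfd = { .fd = fd, .events = POLLOUT };
                    if (poll(&pfd, 1, -1) < 0)
                            return -1;
            }

            memcpy(data, pkt, len);                 /* fill the data buffer */
            hdr->tp_len = len;                      /* announce its size    */
            hdr->tp_status = TP_STATUS_SEND_REQUEST;

            /* one send() flushes every TP_STATUS_SEND_REQUEST frame */
            return (int) send(fd, NULL, 0, 0);
    }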
diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 18db0668065a..5b2badeb9497 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -46,6 +46,8 @@ struct sockaddr_ll
 #define PACKET_VERSION			10
 #define PACKET_HDRLEN			11
 #define PACKET_RESERVE			12
+#define PACKET_TX_RING			13
+#define PACKET_LOSS			14
 
 struct tpacket_stats
 {
@@ -63,14 +65,22 @@ struct tpacket_auxdata
 	__u16		tp_vlan_tci;
 };
 
+/* Rx ring - header status */
+#define TP_STATUS_KERNEL	0x0
+#define TP_STATUS_USER		0x1
+#define TP_STATUS_COPY		0x2
+#define TP_STATUS_LOSING	0x4
+#define TP_STATUS_CSUMNOTREADY	0x8
+
+/* Tx ring - header status */
+#define TP_STATUS_AVAILABLE	0x0
+#define TP_STATUS_SEND_REQUEST	0x1
+#define TP_STATUS_SENDING	0x2
+#define TP_STATUS_WRONG_FORMAT	0x4
+
 struct tpacket_hdr
 {
 	unsigned long	tp_status;
-#define TP_STATUS_KERNEL	0
-#define TP_STATUS_USER		1
-#define TP_STATUS_COPY		2
-#define TP_STATUS_LOSING	4
-#define TP_STATUS_CSUMNOTREADY	8
 	unsigned int	tp_len;
 	unsigned int	tp_snaplen;
 	unsigned short	tp_mac;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1b5c3d298f43..aff494ba6a31 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -203,6 +203,9 @@ struct skb_shared_info {
 #ifdef CONFIG_HAS_DMA
 	dma_addr_t	dma_maps[MAX_SKB_FRAGS + 1];
 #endif
+	/* Intermediate layers must ensure that destructor_arg
+	 * remains valid until skb destructor */
+	void		*destructor_arg;
 };
 
 /* We divide dataref into two halves. The higher 16 bits hold references
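The comment above is terse; the pattern this patch builds on it (visible in
the af_packet.c changes below, cf. tpacket_fill_skb() and
tpacket_destruct_skb()) is sketched here. my_destructor() and attach_frame()
are placeholder names, not kernel APIs:

    #include <linux/skbuff.h>
    #include <net/sock.h>

    /* Destructor: runs when the skb is freed, e.g. after transmission. */
    static void my_destructor(struct sk_buff *skb)
    {
            void *frame = skb_shinfo(skb)->destructor_arg;

            /* hand "frame" back to its owner here */
            (void) frame;
            sock_wfree(skb);
    }

    /* Producer side: stash a cookie in the skb before handing it off. */
    static void attach_frame(struct sk_buff *skb, void *frame)
    {
            /* frame must outlive the skb, as the comment demands */
            skb_shinfo(skb)->destructor_arg = frame;
            skb->destructor = my_destructor;
    }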
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f546e81acc45..766e6b41f7ca 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -39,6 +39,7 @@
39 * will simply extend the hardware address 39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll 40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq. 41 * and packet_mreq.
42 * Johann Baudy : Added TX RING.
42 * 43 *
43 * This program is free software; you can redistribute it and/or 44 * This program is free software; you can redistribute it and/or
44 * modify it under the terms of the GNU General Public License 45 * modify it under the terms of the GNU General Public License
@@ -157,7 +158,25 @@ struct packet_mreq_max
157}; 158};
158 159
159#ifdef CONFIG_PACKET_MMAP 160#ifdef CONFIG_PACKET_MMAP
160static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing); 161static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
162 int closing, int tx_ring);
163
164struct packet_ring_buffer {
165 char * *pg_vec;
166 unsigned int head;
167 unsigned int frames_per_block;
168 unsigned int frame_size;
169 unsigned int frame_max;
170
171 unsigned int pg_vec_order;
172 unsigned int pg_vec_pages;
173 unsigned int pg_vec_len;
174
175 atomic_t pending;
176};
177
178struct packet_sock;
179static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
161#endif 180#endif
162 181
163static void packet_flush_mclist(struct sock *sk); 182static void packet_flush_mclist(struct sock *sk);
@@ -167,11 +186,8 @@ struct packet_sock {
167 struct sock sk; 186 struct sock sk;
168 struct tpacket_stats stats; 187 struct tpacket_stats stats;
169#ifdef CONFIG_PACKET_MMAP 188#ifdef CONFIG_PACKET_MMAP
170 char * *pg_vec; 189 struct packet_ring_buffer rx_ring;
171 unsigned int head; 190 struct packet_ring_buffer tx_ring;
172 unsigned int frames_per_block;
173 unsigned int frame_size;
174 unsigned int frame_max;
175 int copy_thresh; 191 int copy_thresh;
176#endif 192#endif
177 struct packet_type prot_hook; 193 struct packet_type prot_hook;
@@ -185,12 +201,10 @@ struct packet_sock {
185 struct packet_mclist *mclist; 201 struct packet_mclist *mclist;
186#ifdef CONFIG_PACKET_MMAP 202#ifdef CONFIG_PACKET_MMAP
187 atomic_t mapped; 203 atomic_t mapped;
188 unsigned int pg_vec_order;
189 unsigned int pg_vec_pages;
190 unsigned int pg_vec_len;
191 enum tpacket_versions tp_version; 204 enum tpacket_versions tp_version;
192 unsigned int tp_hdrlen; 205 unsigned int tp_hdrlen;
193 unsigned int tp_reserve; 206 unsigned int tp_reserve;
207 unsigned int tp_loss:1;
194#endif 208#endif
195}; 209};
196 210
@@ -206,36 +220,33 @@ struct packet_skb_cb {
206 220
207#ifdef CONFIG_PACKET_MMAP 221#ifdef CONFIG_PACKET_MMAP
208 222
209static void *packet_lookup_frame(struct packet_sock *po, unsigned int position, 223static void __packet_set_status(struct packet_sock *po, void *frame, int status)
210 int status)
211{ 224{
212 unsigned int pg_vec_pos, frame_offset;
213 union { 225 union {
214 struct tpacket_hdr *h1; 226 struct tpacket_hdr *h1;
215 struct tpacket2_hdr *h2; 227 struct tpacket2_hdr *h2;
216 void *raw; 228 void *raw;
217 } h; 229 } h;
218 230
219 pg_vec_pos = position / po->frames_per_block; 231 h.raw = frame;
220 frame_offset = position % po->frames_per_block;
221
222 h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
223 switch (po->tp_version) { 232 switch (po->tp_version) {
224 case TPACKET_V1: 233 case TPACKET_V1:
225 if (status != (h.h1->tp_status ? TP_STATUS_USER : 234 h.h1->tp_status = status;
226 TP_STATUS_KERNEL)) 235 flush_dcache_page(virt_to_page(&h.h1->tp_status));
227 return NULL;
228 break; 236 break;
229 case TPACKET_V2: 237 case TPACKET_V2:
230 if (status != (h.h2->tp_status ? TP_STATUS_USER : 238 h.h2->tp_status = status;
231 TP_STATUS_KERNEL)) 239 flush_dcache_page(virt_to_page(&h.h2->tp_status));
232 return NULL;
233 break; 240 break;
241 default:
242 printk(KERN_ERR "TPACKET version not supported\n");
243 BUG();
234 } 244 }
235 return h.raw; 245
246 smp_wmb();
236} 247}
237 248
238static void __packet_set_status(struct packet_sock *po, void *frame, int status) 249static int __packet_get_status(struct packet_sock *po, void *frame)
239{ 250{
240 union { 251 union {
241 struct tpacket_hdr *h1; 252 struct tpacket_hdr *h1;
@@ -243,16 +254,66 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
243 void *raw; 254 void *raw;
244 } h; 255 } h;
245 256
257 smp_rmb();
258
246 h.raw = frame; 259 h.raw = frame;
247 switch (po->tp_version) { 260 switch (po->tp_version) {
248 case TPACKET_V1: 261 case TPACKET_V1:
249 h.h1->tp_status = status; 262 flush_dcache_page(virt_to_page(&h.h1->tp_status));
250 break; 263 return h.h1->tp_status;
251 case TPACKET_V2: 264 case TPACKET_V2:
252 h.h2->tp_status = status; 265 flush_dcache_page(virt_to_page(&h.h2->tp_status));
253 break; 266 return h.h2->tp_status;
267 default:
268 printk(KERN_ERR "TPACKET version not supported\n");
269 BUG();
270 return 0;
254 } 271 }
255} 272}
273
274static void *packet_lookup_frame(struct packet_sock *po,
275 struct packet_ring_buffer *rb,
276 unsigned int position,
277 int status)
278{
279 unsigned int pg_vec_pos, frame_offset;
280 union {
281 struct tpacket_hdr *h1;
282 struct tpacket2_hdr *h2;
283 void *raw;
284 } h;
285
286 pg_vec_pos = position / rb->frames_per_block;
287 frame_offset = position % rb->frames_per_block;
288
289 h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
290
291 if (status != __packet_get_status(po, h.raw))
292 return NULL;
293
294 return h.raw;
295}
296
297static inline void *packet_current_frame(struct packet_sock *po,
298 struct packet_ring_buffer *rb,
299 int status)
300{
301 return packet_lookup_frame(po, rb, rb->head, status);
302}
303
304static inline void *packet_previous_frame(struct packet_sock *po,
305 struct packet_ring_buffer *rb,
306 int status)
307{
308 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
309 return packet_lookup_frame(po, rb, previous, status);
310}
311
312static inline void packet_increment_head(struct packet_ring_buffer *buff)
313{
314 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
315}
316
256#endif 317#endif
257 318
258static inline struct packet_sock *pkt_sk(struct sock *sk) 319static inline struct packet_sock *pkt_sk(struct sock *sk)
@@ -648,7 +709,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
648 macoff = netoff - maclen; 709 macoff = netoff - maclen;
649 } 710 }
650 711
651 if (macoff + snaplen > po->frame_size) { 712 if (macoff + snaplen > po->rx_ring.frame_size) {
652 if (po->copy_thresh && 713 if (po->copy_thresh &&
653 atomic_read(&sk->sk_rmem_alloc) + skb->truesize < 714 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
654 (unsigned)sk->sk_rcvbuf) { 715 (unsigned)sk->sk_rcvbuf) {
@@ -661,16 +722,16 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
661 if (copy_skb) 722 if (copy_skb)
662 skb_set_owner_r(copy_skb, sk); 723 skb_set_owner_r(copy_skb, sk);
663 } 724 }
664 snaplen = po->frame_size - macoff; 725 snaplen = po->rx_ring.frame_size - macoff;
665 if ((int)snaplen < 0) 726 if ((int)snaplen < 0)
666 snaplen = 0; 727 snaplen = 0;
667 } 728 }
668 729
669 spin_lock(&sk->sk_receive_queue.lock); 730 spin_lock(&sk->sk_receive_queue.lock);
670 h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL); 731 h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
671 if (!h.raw) 732 if (!h.raw)
672 goto ring_is_full; 733 goto ring_is_full;
673 po->head = po->head != po->frame_max ? po->head+1 : 0; 734 packet_increment_head(&po->rx_ring);
674 po->stats.tp_packets++; 735 po->stats.tp_packets++;
675 if (copy_skb) { 736 if (copy_skb) {
676 status |= TP_STATUS_COPY; 737 status |= TP_STATUS_COPY;
@@ -727,7 +788,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
727 788
728 __packet_set_status(po, h.raw, status); 789 __packet_set_status(po, h.raw, status);
729 smp_mb(); 790 smp_mb();
730
731 { 791 {
732 struct page *p_start, *p_end; 792 struct page *p_start, *p_end;
733 u8 *h_end = h.raw + macoff + snaplen - 1; 793 u8 *h_end = h.raw + macoff + snaplen - 1;
@@ -760,10 +820,249 @@ ring_is_full:
760 goto drop_n_restore; 820 goto drop_n_restore;
761} 821}
762 822
763#endif 823static void tpacket_destruct_skb(struct sk_buff *skb)
824{
825 struct packet_sock *po = pkt_sk(skb->sk);
826 void * ph;
764 827
828 BUG_ON(skb == NULL);
765 829
766static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, 830 if (likely(po->tx_ring.pg_vec)) {
831 ph = skb_shinfo(skb)->destructor_arg;
832 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
833 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
834 atomic_dec(&po->tx_ring.pending);
835 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
836 }
837
838 sock_wfree(skb);
839}
840
841static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff * skb,
842 void * frame, struct net_device *dev, int size_max,
843 __be16 proto, unsigned char * addr)
844{
845 union {
846 struct tpacket_hdr *h1;
847 struct tpacket2_hdr *h2;
848 void *raw;
849 } ph;
850 int to_write, offset, len, tp_len, nr_frags, len_max;
851 struct socket *sock = po->sk.sk_socket;
852 struct page *page;
853 void *data;
854 int err;
855
856 ph.raw = frame;
857
858 skb->protocol = proto;
859 skb->dev = dev;
860 skb->priority = po->sk.sk_priority;
861 skb_shinfo(skb)->destructor_arg = ph.raw;
862
863 switch (po->tp_version) {
864 case TPACKET_V2:
865 tp_len = ph.h2->tp_len;
866 break;
867 default:
868 tp_len = ph.h1->tp_len;
869 break;
870 }
871 if (unlikely(tp_len > size_max)) {
872 printk(KERN_ERR "packet size is too long (%d > %d)\n",
873 tp_len, size_max);
874 return -EMSGSIZE;
875 }
876
877 skb_reserve(skb, LL_RESERVED_SPACE(dev));
878 skb_reset_network_header(skb);
879
880 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
881 to_write = tp_len;
882
883 if (sock->type == SOCK_DGRAM) {
884 err = dev_hard_header(skb, dev, ntohs(proto), addr,
885 NULL, tp_len);
886 if (unlikely(err < 0))
887 return -EINVAL;
888 } else if (dev->hard_header_len ) {
889 /* net device doesn't like empty head */
890 if (unlikely(tp_len <= dev->hard_header_len)) {
891 printk(KERN_ERR "packet size is too short "
892 "(%d < %d)\n", tp_len,
893 dev->hard_header_len);
894 return -EINVAL;
895 }
896
897 skb_push(skb, dev->hard_header_len);
898 err = skb_store_bits(skb, 0, data,
899 dev->hard_header_len);
900 if (unlikely(err))
901 return err;
902
903 data += dev->hard_header_len;
904 to_write -= dev->hard_header_len;
905 }
906
907 err = -EFAULT;
908 page = virt_to_page(data);
909 offset = offset_in_page(data);
910 len_max = PAGE_SIZE - offset;
911 len = ((to_write > len_max) ? len_max : to_write);
912
913 skb->data_len = to_write;
914 skb->len += to_write;
915 skb->truesize += to_write;
916 atomic_add(to_write, &po->sk.sk_wmem_alloc);
917
918 while (likely(to_write)) {
919 nr_frags = skb_shinfo(skb)->nr_frags;
920
921 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
922 printk(KERN_ERR "Packet exceed the number "
923 "of skb frags(%lu)\n",
924 MAX_SKB_FRAGS);
925 return -EFAULT;
926 }
927
928 flush_dcache_page(page);
929 get_page(page);
930 skb_fill_page_desc(skb,
931 nr_frags,
932 page++, offset, len);
933 to_write -= len;
934 offset = 0;
935 len_max = PAGE_SIZE;
936 len = ((to_write > len_max) ? len_max : to_write);
937 }
938
939 return tp_len;
940}
941
942static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
943{
944 struct socket *sock;
945 struct sk_buff *skb;
946 struct net_device *dev;
947 __be16 proto;
948 int ifindex, err, reserve = 0;
949 void * ph;
950 struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
951 int tp_len, size_max;
952 unsigned char *addr;
953 int len_sum = 0;
954 int status = 0;
955
956 sock = po->sk.sk_socket;
957
958 mutex_lock(&po->pg_vec_lock);
959
960 err = -EBUSY;
961 if (saddr == NULL) {
962 ifindex = po->ifindex;
963 proto = po->num;
964 addr = NULL;
965 } else {
966 err = -EINVAL;
967 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
968 goto out;
969 if (msg->msg_namelen < (saddr->sll_halen
970 + offsetof(struct sockaddr_ll,
971 sll_addr)))
972 goto out;
973 ifindex = saddr->sll_ifindex;
974 proto = saddr->sll_protocol;
975 addr = saddr->sll_addr;
976 }
977
978 dev = dev_get_by_index(sock_net(&po->sk), ifindex);
979 err = -ENXIO;
980 if (unlikely(dev == NULL))
981 goto out;
982
983 reserve = dev->hard_header_len;
984
985 err = -ENETDOWN;
986 if (unlikely(!(dev->flags & IFF_UP)))
987 goto out_put;
988
989 size_max = po->tx_ring.frame_size
990 - sizeof(struct skb_shared_info)
991 - po->tp_hdrlen
992 - LL_ALLOCATED_SPACE(dev)
993 - sizeof(struct sockaddr_ll);
994
995 if (size_max > dev->mtu + reserve)
996 size_max = dev->mtu + reserve;
997
998 do {
999 ph = packet_current_frame(po, &po->tx_ring,
1000 TP_STATUS_SEND_REQUEST);
1001
1002 if (unlikely(ph == NULL)) {
1003 schedule();
1004 continue;
1005 }
1006
1007 status = TP_STATUS_SEND_REQUEST;
1008 skb = sock_alloc_send_skb(&po->sk,
1009 LL_ALLOCATED_SPACE(dev)
1010 + sizeof(struct sockaddr_ll),
1011 0, &err);
1012
1013 if (unlikely(skb == NULL))
1014 goto out_status;
1015
1016 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1017 addr);
1018
1019 if (unlikely(tp_len < 0)) {
1020 if (po->tp_loss) {
1021 __packet_set_status(po, ph,
1022 TP_STATUS_AVAILABLE);
1023 packet_increment_head(&po->tx_ring);
1024 kfree_skb(skb);
1025 continue;
1026 } else {
1027 status = TP_STATUS_WRONG_FORMAT;
1028 err = tp_len;
1029 goto out_status;
1030 }
1031 }
1032
1033 skb->destructor = tpacket_destruct_skb;
1034 __packet_set_status(po, ph, TP_STATUS_SENDING);
1035 atomic_inc(&po->tx_ring.pending);
1036
1037 status = TP_STATUS_SEND_REQUEST;
1038 err = dev_queue_xmit(skb);
1039 if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1040 goto out_xmit;
1041 packet_increment_head(&po->tx_ring);
1042 len_sum += tp_len;
1043 }
1044 while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1045 && (atomic_read(&po->tx_ring.pending))))
1046 );
1047
1048 err = len_sum;
1049 goto out_put;
1050
1051out_xmit:
1052 skb->destructor = sock_wfree;
1053 atomic_dec(&po->tx_ring.pending);
1054out_status:
1055 __packet_set_status(po, ph, status);
1056 kfree_skb(skb);
1057out_put:
1058 dev_put(dev);
1059out:
1060 mutex_unlock(&po->pg_vec_lock);
1061 return err;
1062}
1063#endif
1064
1065static int packet_snd(struct socket *sock,
767 struct msghdr *msg, size_t len) 1066 struct msghdr *msg, size_t len)
768{ 1067{
769 struct sock *sk = sock->sk; 1068 struct sock *sk = sock->sk;
@@ -854,6 +1153,19 @@ out:
854 return err; 1153 return err;
855} 1154}
856 1155
1156static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1157 struct msghdr *msg, size_t len)
1158{
1159#ifdef CONFIG_PACKET_MMAP
1160 struct sock *sk = sock->sk;
1161 struct packet_sock *po = pkt_sk(sk);
1162 if (po->tx_ring.pg_vec)
1163 return tpacket_snd(po, msg);
1164 else
1165#endif
1166 return packet_snd(sock, msg, len);
1167}
1168
857/* 1169/*
858 * Close a PACKET socket. This is fairly simple. We immediately go 1170 * Close a PACKET socket. This is fairly simple. We immediately go
859 * to 'closed' state and remove our protocol entry in the device list. 1171 * to 'closed' state and remove our protocol entry in the device list.
@@ -864,6 +1176,9 @@ static int packet_release(struct socket *sock)
864 struct sock *sk = sock->sk; 1176 struct sock *sk = sock->sk;
865 struct packet_sock *po; 1177 struct packet_sock *po;
866 struct net *net; 1178 struct net *net;
1179#ifdef CONFIG_PACKET_MMAP
1180 struct tpacket_req req;
1181#endif
867 1182
868 if (!sk) 1183 if (!sk)
869 return 0; 1184 return 0;
@@ -893,11 +1208,13 @@ static int packet_release(struct socket *sock)
893 packet_flush_mclist(sk); 1208 packet_flush_mclist(sk);
894 1209
895#ifdef CONFIG_PACKET_MMAP 1210#ifdef CONFIG_PACKET_MMAP
896 if (po->pg_vec) { 1211 memset(&req, 0, sizeof(req));
897 struct tpacket_req req; 1212
898 memset(&req, 0, sizeof(req)); 1213 if (po->rx_ring.pg_vec)
899 packet_set_ring(sk, &req, 1); 1214 packet_set_ring(sk, &req, 1, 0);
900 } 1215
1216 if (po->tx_ring.pg_vec)
1217 packet_set_ring(sk, &req, 1, 1);
901#endif 1218#endif
902 1219
903 /* 1220 /*
@@ -1391,7 +1708,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1391 if (level != SOL_PACKET) 1708 if (level != SOL_PACKET)
1392 return -ENOPROTOOPT; 1709 return -ENOPROTOOPT;
1393 1710
1394 switch(optname) { 1711 switch (optname) {
1395 case PACKET_ADD_MEMBERSHIP: 1712 case PACKET_ADD_MEMBERSHIP:
1396 case PACKET_DROP_MEMBERSHIP: 1713 case PACKET_DROP_MEMBERSHIP:
1397 { 1714 {
@@ -1415,6 +1732,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1415 1732
1416#ifdef CONFIG_PACKET_MMAP 1733#ifdef CONFIG_PACKET_MMAP
1417 case PACKET_RX_RING: 1734 case PACKET_RX_RING:
1735 case PACKET_TX_RING:
1418 { 1736 {
1419 struct tpacket_req req; 1737 struct tpacket_req req;
1420 1738
@@ -1422,7 +1740,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1422 return -EINVAL; 1740 return -EINVAL;
1423 if (copy_from_user(&req,optval,sizeof(req))) 1741 if (copy_from_user(&req,optval,sizeof(req)))
1424 return -EFAULT; 1742 return -EFAULT;
1425 return packet_set_ring(sk, &req, 0); 1743 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1426 } 1744 }
1427 case PACKET_COPY_THRESH: 1745 case PACKET_COPY_THRESH:
1428 { 1746 {
@@ -1442,7 +1760,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1442 1760
1443 if (optlen != sizeof(val)) 1761 if (optlen != sizeof(val))
1444 return -EINVAL; 1762 return -EINVAL;
1445 if (po->pg_vec) 1763 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1446 return -EBUSY; 1764 return -EBUSY;
1447 if (copy_from_user(&val, optval, sizeof(val))) 1765 if (copy_from_user(&val, optval, sizeof(val)))
1448 return -EFAULT; 1766 return -EFAULT;
@@ -1461,13 +1779,26 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1461 1779
1462 if (optlen != sizeof(val)) 1780 if (optlen != sizeof(val))
1463 return -EINVAL; 1781 return -EINVAL;
1464 if (po->pg_vec) 1782 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1465 return -EBUSY; 1783 return -EBUSY;
1466 if (copy_from_user(&val, optval, sizeof(val))) 1784 if (copy_from_user(&val, optval, sizeof(val)))
1467 return -EFAULT; 1785 return -EFAULT;
1468 po->tp_reserve = val; 1786 po->tp_reserve = val;
1469 return 0; 1787 return 0;
1470 } 1788 }
1789 case PACKET_LOSS:
1790 {
1791 unsigned int val;
1792
1793 if (optlen != sizeof(val))
1794 return -EINVAL;
1795 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1796 return -EBUSY;
1797 if (copy_from_user(&val, optval, sizeof(val)))
1798 return -EFAULT;
1799 po->tp_loss = !!val;
1800 return 0;
1801 }
1471#endif 1802#endif
1472 case PACKET_AUXDATA: 1803 case PACKET_AUXDATA:
1473 { 1804 {
@@ -1517,7 +1848,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
1517 if (len < 0) 1848 if (len < 0)
1518 return -EINVAL; 1849 return -EINVAL;
1519 1850
1520 switch(optname) { 1851 switch (optname) {
1521 case PACKET_STATISTICS: 1852 case PACKET_STATISTICS:
1522 if (len > sizeof(struct tpacket_stats)) 1853 if (len > sizeof(struct tpacket_stats))
1523 len = sizeof(struct tpacket_stats); 1854 len = sizeof(struct tpacket_stats);
@@ -1573,6 +1904,12 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
1573 val = po->tp_reserve; 1904 val = po->tp_reserve;
1574 data = &val; 1905 data = &val;
1575 break; 1906 break;
1907 case PACKET_LOSS:
1908 if (len > sizeof(unsigned int))
1909 len = sizeof(unsigned int);
1910 val = po->tp_loss;
1911 data = &val;
1912 break;
1576#endif 1913#endif
1577 default: 1914 default:
1578 return -ENOPROTOOPT; 1915 return -ENOPROTOOPT;
@@ -1643,7 +1980,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
1643{ 1980{
1644 struct sock *sk = sock->sk; 1981 struct sock *sk = sock->sk;
1645 1982
1646 switch(cmd) { 1983 switch (cmd) {
1647 case SIOCOUTQ: 1984 case SIOCOUTQ:
1648 { 1985 {
1649 int amount = atomic_read(&sk->sk_wmem_alloc); 1986 int amount = atomic_read(&sk->sk_wmem_alloc);
@@ -1705,13 +2042,17 @@ static unsigned int packet_poll(struct file * file, struct socket *sock,
1705 unsigned int mask = datagram_poll(file, sock, wait); 2042 unsigned int mask = datagram_poll(file, sock, wait);
1706 2043
1707 spin_lock_bh(&sk->sk_receive_queue.lock); 2044 spin_lock_bh(&sk->sk_receive_queue.lock);
1708 if (po->pg_vec) { 2045 if (po->rx_ring.pg_vec) {
1709 unsigned last = po->head ? po->head-1 : po->frame_max; 2046 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
1710
1711 if (packet_lookup_frame(po, last, TP_STATUS_USER))
1712 mask |= POLLIN | POLLRDNORM; 2047 mask |= POLLIN | POLLRDNORM;
1713 } 2048 }
1714 spin_unlock_bh(&sk->sk_receive_queue.lock); 2049 spin_unlock_bh(&sk->sk_receive_queue.lock);
2050 spin_lock_bh(&sk->sk_write_queue.lock);
2051 if (po->tx_ring.pg_vec) {
2052 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2053 mask |= POLLOUT | POLLWRNORM;
2054 }
2055 spin_unlock_bh(&sk->sk_write_queue.lock);
1715 return mask; 2056 return mask;
1716} 2057}
1717 2058
@@ -1788,21 +2129,33 @@ out_free_pgvec:
1788 goto out; 2129 goto out;
1789} 2130}
1790 2131
1791static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) 2132static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2133 int closing, int tx_ring)
1792{ 2134{
1793 char **pg_vec = NULL; 2135 char **pg_vec = NULL;
1794 struct packet_sock *po = pkt_sk(sk); 2136 struct packet_sock *po = pkt_sk(sk);
1795 int was_running, order = 0; 2137 int was_running, order = 0;
2138 struct packet_ring_buffer *rb;
2139 struct sk_buff_head *rb_queue;
1796 __be16 num; 2140 __be16 num;
1797 int err = 0; 2141 int err;
1798 2142
1799 if (req->tp_block_nr) { 2143 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
1800 int i; 2144 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1801 2145
1802 /* Sanity tests and some calculations */ 2146 err = -EBUSY;
2147 if (!closing) {
2148 if (atomic_read(&po->mapped))
2149 goto out;
2150 if (atomic_read(&rb->pending))
2151 goto out;
2152 }
1803 2153
1804 if (unlikely(po->pg_vec)) 2154 if (req->tp_block_nr) {
1805 return -EBUSY; 2155 /* Sanity tests and some calculations */
2156 err = -EBUSY;
2157 if (unlikely(rb->pg_vec))
2158 goto out;
1806 2159
1807 switch (po->tp_version) { 2160 switch (po->tp_version) {
1808 case TPACKET_V1: 2161 case TPACKET_V1:
@@ -1813,42 +2166,35 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
1813 break; 2166 break;
1814 } 2167 }
1815 2168
2169 err = -EINVAL;
1816 if (unlikely((int)req->tp_block_size <= 0)) 2170 if (unlikely((int)req->tp_block_size <= 0))
1817 return -EINVAL; 2171 goto out;
1818 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) 2172 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1819 return -EINVAL; 2173 goto out;
1820 if (unlikely(req->tp_frame_size < po->tp_hdrlen + 2174 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
1821 po->tp_reserve)) 2175 po->tp_reserve))
1822 return -EINVAL; 2176 goto out;
1823 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) 2177 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1824 return -EINVAL; 2178 goto out;
1825 2179
1826 po->frames_per_block = req->tp_block_size/req->tp_frame_size; 2180 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
1827 if (unlikely(po->frames_per_block <= 0)) 2181 if (unlikely(rb->frames_per_block <= 0))
1828 return -EINVAL; 2182 goto out;
1829 if (unlikely((po->frames_per_block * req->tp_block_nr) != 2183 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
1830 req->tp_frame_nr)) 2184 req->tp_frame_nr))
1831 return -EINVAL; 2185 goto out;
1832 2186
1833 err = -ENOMEM; 2187 err = -ENOMEM;
1834 order = get_order(req->tp_block_size); 2188 order = get_order(req->tp_block_size);
1835 pg_vec = alloc_pg_vec(req, order); 2189 pg_vec = alloc_pg_vec(req, order);
1836 if (unlikely(!pg_vec)) 2190 if (unlikely(!pg_vec))
1837 goto out; 2191 goto out;
1838 2192 }
1839 for (i = 0; i < req->tp_block_nr; i++) { 2193 /* Done */
1840 void *ptr = pg_vec[i]; 2194 else {
1841 int k; 2195 err = -EINVAL;
1842
1843 for (k = 0; k < po->frames_per_block; k++) {
1844 __packet_set_status(po, ptr, TP_STATUS_KERNEL);
1845 ptr += req->tp_frame_size;
1846 }
1847 }
1848 /* Done */
1849 } else {
1850 if (unlikely(req->tp_frame_nr)) 2196 if (unlikely(req->tp_frame_nr))
1851 return -EINVAL; 2197 goto out;
1852 } 2198 }
1853 2199
1854 lock_sock(sk); 2200 lock_sock(sk);
@@ -1872,23 +2218,24 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
1872 if (closing || atomic_read(&po->mapped) == 0) { 2218 if (closing || atomic_read(&po->mapped) == 0) {
1873 err = 0; 2219 err = 0;
1874#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; }) 2220#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1875 2221 spin_lock_bh(&rb_queue->lock);
1876 spin_lock_bh(&sk->sk_receive_queue.lock); 2222 pg_vec = XC(rb->pg_vec, pg_vec);
1877 pg_vec = XC(po->pg_vec, pg_vec); 2223 rb->frame_max = (req->tp_frame_nr - 1);
1878 po->frame_max = (req->tp_frame_nr - 1); 2224 rb->head = 0;
1879 po->head = 0; 2225 rb->frame_size = req->tp_frame_size;
1880 po->frame_size = req->tp_frame_size; 2226 spin_unlock_bh(&rb_queue->lock);
1881 spin_unlock_bh(&sk->sk_receive_queue.lock); 2227
1882 2228 order = XC(rb->pg_vec_order, order);
1883 order = XC(po->pg_vec_order, order); 2229 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
1884 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr); 2230
1885 2231 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1886 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE; 2232 po->prot_hook.func = (po->rx_ring.pg_vec) ?
1887 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv; 2233 tpacket_rcv : packet_rcv;
1888 skb_queue_purge(&sk->sk_receive_queue); 2234 skb_queue_purge(rb_queue);
1889#undef XC 2235#undef XC
1890 if (atomic_read(&po->mapped)) 2236 if (atomic_read(&po->mapped))
1891 printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped)); 2237 printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
2238 atomic_read(&po->mapped));
1892 } 2239 }
1893 mutex_unlock(&po->pg_vec_lock); 2240 mutex_unlock(&po->pg_vec_lock);
1894 2241
@@ -1909,11 +2256,13 @@ out:
1909 return err; 2256 return err;
1910} 2257}
1911 2258
1912static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2259static int packet_mmap(struct file *file, struct socket *sock,
2260 struct vm_area_struct *vma)
1913{ 2261{
1914 struct sock *sk = sock->sk; 2262 struct sock *sk = sock->sk;
1915 struct packet_sock *po = pkt_sk(sk); 2263 struct packet_sock *po = pkt_sk(sk);
1916 unsigned long size; 2264 unsigned long size, expected_size;
2265 struct packet_ring_buffer *rb;
1917 unsigned long start; 2266 unsigned long start;
1918 int err = -EINVAL; 2267 int err = -EINVAL;
1919 int i; 2268 int i;
@@ -1921,26 +2270,43 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st
1921 if (vma->vm_pgoff) 2270 if (vma->vm_pgoff)
1922 return -EINVAL; 2271 return -EINVAL;
1923 2272
1924 size = vma->vm_end - vma->vm_start;
1925
1926 mutex_lock(&po->pg_vec_lock); 2273 mutex_lock(&po->pg_vec_lock);
1927 if (po->pg_vec == NULL) 2274
2275 expected_size = 0;
2276 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2277 if (rb->pg_vec) {
2278 expected_size += rb->pg_vec_len
2279 * rb->pg_vec_pages
2280 * PAGE_SIZE;
2281 }
2282 }
2283
2284 if (expected_size == 0)
1928 goto out; 2285 goto out;
1929 if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE) 2286
2287 size = vma->vm_end - vma->vm_start;
2288 if (size != expected_size)
1930 goto out; 2289 goto out;
1931 2290
1932 start = vma->vm_start; 2291 start = vma->vm_start;
1933 for (i = 0; i < po->pg_vec_len; i++) { 2292 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
1934 struct page *page = virt_to_page(po->pg_vec[i]); 2293 if (rb->pg_vec == NULL)
1935 int pg_num; 2294 continue;
1936 2295
1937 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) { 2296 for (i = 0; i < rb->pg_vec_len; i++) {
1938 err = vm_insert_page(vma, start, page); 2297 struct page *page = virt_to_page(rb->pg_vec[i]);
1939 if (unlikely(err)) 2298 int pg_num;
1940 goto out; 2299
1941 start += PAGE_SIZE; 2300 for (pg_num = 0; pg_num < rb->pg_vec_pages;
2301 pg_num++,page++) {
2302 err = vm_insert_page(vma, start, page);
2303 if (unlikely(err))
2304 goto out;
2305 start += PAGE_SIZE;
2306 }
1942 } 2307 }
1943 } 2308 }
2309
1944 atomic_inc(&po->mapped); 2310 atomic_inc(&po->mapped);
1945 vma->vm_ops = &packet_mmap_ops; 2311 vma->vm_ops = &packet_mmap_ops;
1946 err = 0; 2312 err = 0;
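Since packet_mmap() now walks the RX ring first and the TX ring immediately
after it, and rejects any mapping length other than the sum of both, a process
that configured both rings must map their combined size with a single mmap()
call. A user-space sketch follows; map_rings() is a hypothetical helper, and
both tpacket_req structures are assumed to be the ones passed to setsockopt():

    #include <stddef.h>
    #include <sys/mman.h>
    #include <linux/if_packet.h>

    /* Map both rings at once; the TX frames follow the RX frames. */
    static char *map_rings(int fd, const struct tpacket_req *rx,
                           const struct tpacket_req *tx, char **tx_ring)
    {
            size_t rx_size = (size_t) rx->tp_block_size * rx->tp_block_nr;
            size_t tx_size = (size_t) tx->tp_block_size * tx->tp_block_nr;
            char *ring = mmap(NULL, rx_size + tx_size,
                              PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

            if (ring == MAP_FAILED)
                    return NULL;
            *tx_ring = ring + rx_size;  /* TX ring starts after RX ring */
            return ring;                /* RX ring starts at offset 0   */
    }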