author		Johann Baudy <johann.baudy@gnu-log.net>	2009-05-19 01:11:22 -0400
committer	David S. Miller <davem@davemloft.net>	2009-05-19 01:11:22 -0400
commit		69e3c75f4d541a6eb151b3ef91f34033cb3ad6e1 (patch)
tree		24920f17ea435627978af9d5fe0e99763bf6a533 /net
parent		f67f34084914144de55c785163d047d5d8dddd2d (diff)
net: TX_RING and packet mmap
New packet socket feature that makes packet sockets more efficient for
transmission.

- It reduces the number of system calls through a PACKET_TX_RING
  mechanism, based on PACKET_RX_RING (a circular buffer allocated in
  kernel space which is mmapped from user space).
- It minimizes CPU copying by using fragmented SKBs (almost zero copy).

Signed-off-by: Johann Baudy <johann.baudy@gnu-log.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
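For illustration only (this example is not part of the commit): a minimal
userspace transmit loop over the new PACKET_TX_RING could look like the
sketch below. It assumes a PF_PACKET SOCK_RAW socket already bound to an
interface, TPACKET_V1 frame headers, and the TP_STATUS_* constants this
patch adds to linux/if_packet.h; the function name and ring sizing are
made up for the sketch, and error handling and teardown are elided.

#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Sketch: create a TX ring, fill every free slot with one packet,
 * then flush all TP_STATUS_SEND_REQUEST frames with a single send().
 * fd is assumed to be a bound PF_PACKET/SOCK_RAW socket. */
static int tx_ring_send(int fd, const void *pkt, size_t pkt_len)
{
	struct tpacket_req req = {
		.tp_block_size = 4096,
		.tp_block_nr   = 64,
		.tp_frame_size = 2048,
		.tp_frame_nr   = 128,	/* block_nr * frames per block */
	};
	if (setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req)) < 0)
		return -1;

	char *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return -1;

	for (unsigned int i = 0; i < req.tp_frame_nr; i++) {
		struct tpacket_hdr *hdr = (struct tpacket_hdr *)
				(ring + (size_t)i * req.tp_frame_size);
		if (hdr->tp_status != TP_STATUS_AVAILABLE)
			continue;	/* slot still owned by the kernel */
		/* Data starts where the kernel's tpacket_fill_skb() reads
		 * it: tp_hdrlen minus the sockaddr_ll, i.e. right after
		 * the aligned tpacket_hdr. */
		memcpy((char *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr)),
		       pkt, pkt_len);
		hdr->tp_len = pkt_len;
		hdr->tp_status = TP_STATUS_SEND_REQUEST; /* hand to kernel */
	}
	/* One system call transmits every queued frame. */
	return (int)send(fd, NULL, 0, 0);
}

Rather than busy-looping on free slots, a sender can poll() for POLLOUT,
which this patch wires up in packet_poll().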
Diffstat (limited to 'net')
-rw-r--r--	net/packet/af_packet.c	588
1 file changed, 477 insertions(+), 111 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f546e81acc45..766e6b41f7ca 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -39,6 +39,7 @@
  *	will simply extend the hardware address
  *	byte arrays at the end of sockaddr_ll
  *	and packet_mreq.
+ *	Johann Baudy : Added TX RING.
  *
  *	This program is free software; you can redistribute it and/or
  *	modify it under the terms of the GNU General Public License
@@ -157,7 +158,25 @@ struct packet_mreq_max
 };
 
 #ifdef CONFIG_PACKET_MMAP
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
+static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+		int closing, int tx_ring);
+
+struct packet_ring_buffer {
+	char			**pg_vec;
+	unsigned int		head;
+	unsigned int		frames_per_block;
+	unsigned int		frame_size;
+	unsigned int		frame_max;
+
+	unsigned int		pg_vec_order;
+	unsigned int		pg_vec_pages;
+	unsigned int		pg_vec_len;
+
+	atomic_t		pending;
+};
+
+struct packet_sock;
+static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
 #endif
 
 static void packet_flush_mclist(struct sock *sk);
@@ -167,11 +186,8 @@ struct packet_sock {
 	struct sock		sk;
 	struct tpacket_stats	stats;
 #ifdef CONFIG_PACKET_MMAP
-	char			**pg_vec;
-	unsigned int		head;
-	unsigned int		frames_per_block;
-	unsigned int		frame_size;
-	unsigned int		frame_max;
+	struct packet_ring_buffer	rx_ring;
+	struct packet_ring_buffer	tx_ring;
 	int			copy_thresh;
 #endif
 	struct packet_type	prot_hook;
@@ -185,12 +201,10 @@ struct packet_sock {
 	struct packet_mclist	*mclist;
 #ifdef CONFIG_PACKET_MMAP
 	atomic_t		mapped;
-	unsigned int		pg_vec_order;
-	unsigned int		pg_vec_pages;
-	unsigned int		pg_vec_len;
 	enum tpacket_versions	tp_version;
 	unsigned int		tp_hdrlen;
 	unsigned int		tp_reserve;
+	unsigned int		tp_loss:1;
 #endif
 };
 
@@ -206,36 +220,33 @@ struct packet_skb_cb {
 
 #ifdef CONFIG_PACKET_MMAP
 
-static void *packet_lookup_frame(struct packet_sock *po, unsigned int position,
-		int status)
+static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 {
-	unsigned int pg_vec_pos, frame_offset;
 	union {
 		struct tpacket_hdr *h1;
 		struct tpacket2_hdr *h2;
 		void *raw;
 	} h;
 
-	pg_vec_pos = position / po->frames_per_block;
-	frame_offset = position % po->frames_per_block;
-
-	h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
+	h.raw = frame;
 	switch (po->tp_version) {
 	case TPACKET_V1:
-		if (status != (h.h1->tp_status ? TP_STATUS_USER :
-					TP_STATUS_KERNEL))
-			return NULL;
+		h.h1->tp_status = status;
+		flush_dcache_page(virt_to_page(&h.h1->tp_status));
 		break;
 	case TPACKET_V2:
-		if (status != (h.h2->tp_status ? TP_STATUS_USER :
-					TP_STATUS_KERNEL))
-			return NULL;
+		h.h2->tp_status = status;
+		flush_dcache_page(virt_to_page(&h.h2->tp_status));
 		break;
+	default:
+		printk(KERN_ERR "TPACKET version not supported\n");
+		BUG();
 	}
-	return h.raw;
+
+	smp_wmb();
 }
 
-static void __packet_set_status(struct packet_sock *po, void *frame, int status)
+static int __packet_get_status(struct packet_sock *po, void *frame)
 {
 	union {
 		struct tpacket_hdr *h1;
@@ -243,16 +254,66 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		void *raw;
 	} h;
 
+	smp_rmb();
+
 	h.raw = frame;
 	switch (po->tp_version) {
 	case TPACKET_V1:
-		h.h1->tp_status = status;
-		break;
+		flush_dcache_page(virt_to_page(&h.h1->tp_status));
+		return h.h1->tp_status;
 	case TPACKET_V2:
-		h.h2->tp_status = status;
-		break;
+		flush_dcache_page(virt_to_page(&h.h2->tp_status));
+		return h.h2->tp_status;
+	default:
+		printk(KERN_ERR "TPACKET version not supported\n");
+		BUG();
+		return 0;
 	}
 }
+
+static void *packet_lookup_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		unsigned int position,
+		int status)
+{
+	unsigned int pg_vec_pos, frame_offset;
+	union {
+		struct tpacket_hdr *h1;
+		struct tpacket2_hdr *h2;
+		void *raw;
+	} h;
+
+	pg_vec_pos = position / rb->frames_per_block;
+	frame_offset = position % rb->frames_per_block;
+
+	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+
+	if (status != __packet_get_status(po, h.raw))
+		return NULL;
+
+	return h.raw;
+}
+
+static inline void *packet_current_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status)
+{
+	return packet_lookup_frame(po, rb, rb->head, status);
+}
+
+static inline void *packet_previous_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status)
+{
+	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
+	return packet_lookup_frame(po, rb, previous, status);
+}
+
+static inline void packet_increment_head(struct packet_ring_buffer *buff)
+{
+	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
+}
+
 #endif
 
 static inline struct packet_sock *pkt_sk(struct sock *sk)
@@ -648,7 +709,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
 		macoff = netoff - maclen;
 	}
 
-	if (macoff + snaplen > po->frame_size) {
+	if (macoff + snaplen > po->rx_ring.frame_size) {
 		if (po->copy_thresh &&
 		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
 		    (unsigned)sk->sk_rcvbuf) {
@@ -661,16 +722,16 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
 			if (copy_skb)
 				skb_set_owner_r(copy_skb, sk);
 		}
-		snaplen = po->frame_size - macoff;
+		snaplen = po->rx_ring.frame_size - macoff;
 		if ((int)snaplen < 0)
 			snaplen = 0;
 	}
 
 	spin_lock(&sk->sk_receive_queue.lock);
-	h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL);
+	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
 	if (!h.raw)
 		goto ring_is_full;
-	po->head = po->head != po->frame_max ? po->head+1 : 0;
+	packet_increment_head(&po->rx_ring);
 	po->stats.tp_packets++;
 	if (copy_skb) {
 		status |= TP_STATUS_COPY;
@@ -727,7 +788,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
 
 	__packet_set_status(po, h.raw, status);
 	smp_mb();
-
 	{
 		struct page *p_start, *p_end;
 		u8 *h_end = h.raw + macoff + snaplen - 1;
@@ -760,10 +820,249 @@ ring_is_full:
 	goto drop_n_restore;
 }
 
-#endif
+static void tpacket_destruct_skb(struct sk_buff *skb)
+{
+	struct packet_sock *po = pkt_sk(skb->sk);
+	void *ph;
 
+	BUG_ON(skb == NULL);
 
-static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
+	if (likely(po->tx_ring.pg_vec)) {
+		ph = skb_shinfo(skb)->destructor_arg;
+		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
+		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
+		atomic_dec(&po->tx_ring.pending);
+		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
+	}
+
+	sock_wfree(skb);
+}
+
+static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
+		void *frame, struct net_device *dev, int size_max,
+		__be16 proto, unsigned char *addr)
+{
+	union {
+		struct tpacket_hdr *h1;
+		struct tpacket2_hdr *h2;
+		void *raw;
+	} ph;
+	int to_write, offset, len, tp_len, nr_frags, len_max;
+	struct socket *sock = po->sk.sk_socket;
+	struct page *page;
+	void *data;
+	int err;
+
+	ph.raw = frame;
+
+	skb->protocol = proto;
+	skb->dev = dev;
+	skb->priority = po->sk.sk_priority;
+	skb_shinfo(skb)->destructor_arg = ph.raw;
+
+	switch (po->tp_version) {
+	case TPACKET_V2:
+		tp_len = ph.h2->tp_len;
+		break;
+	default:
+		tp_len = ph.h1->tp_len;
+		break;
+	}
+	if (unlikely(tp_len > size_max)) {
+		printk(KERN_ERR "packet size is too long (%d > %d)\n",
+			tp_len, size_max);
+		return -EMSGSIZE;
+	}
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	skb_reset_network_header(skb);
+
+	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
+	to_write = tp_len;
+
+	if (sock->type == SOCK_DGRAM) {
+		err = dev_hard_header(skb, dev, ntohs(proto), addr,
+				NULL, tp_len);
+		if (unlikely(err < 0))
+			return -EINVAL;
+	} else if (dev->hard_header_len) {
+		/* net device doesn't like empty head */
+		if (unlikely(tp_len <= dev->hard_header_len)) {
+			printk(KERN_ERR "packet size is too short "
+				"(%d < %d)\n", tp_len,
+				dev->hard_header_len);
+			return -EINVAL;
+		}
+
+		skb_push(skb, dev->hard_header_len);
+		err = skb_store_bits(skb, 0, data,
+				dev->hard_header_len);
+		if (unlikely(err))
+			return err;
+
+		data += dev->hard_header_len;
+		to_write -= dev->hard_header_len;
+	}
+
+	err = -EFAULT;
+	page = virt_to_page(data);
+	offset = offset_in_page(data);
+	len_max = PAGE_SIZE - offset;
+	len = ((to_write > len_max) ? len_max : to_write);
+
+	skb->data_len = to_write;
+	skb->len += to_write;
+	skb->truesize += to_write;
+	atomic_add(to_write, &po->sk.sk_wmem_alloc);
+
+	while (likely(to_write)) {
+		nr_frags = skb_shinfo(skb)->nr_frags;
+
+		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
+			printk(KERN_ERR "Packet exceed the number "
+				"of skb frags(%lu)\n",
+				MAX_SKB_FRAGS);
+			return -EFAULT;
+		}
+
+		flush_dcache_page(page);
+		get_page(page);
+		skb_fill_page_desc(skb,
+				nr_frags,
+				page++, offset, len);
+		to_write -= len;
+		offset = 0;
+		len_max = PAGE_SIZE;
+		len = ((to_write > len_max) ? len_max : to_write);
+	}
+
+	return tp_len;
+}
+
+static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
+{
+	struct socket *sock;
+	struct sk_buff *skb;
+	struct net_device *dev;
+	__be16 proto;
+	int ifindex, err, reserve = 0;
+	void *ph;
+	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
+	int tp_len, size_max;
+	unsigned char *addr;
+	int len_sum = 0;
+	int status = 0;
+
+	sock = po->sk.sk_socket;
+
+	mutex_lock(&po->pg_vec_lock);
+
+	err = -EBUSY;
+	if (saddr == NULL) {
+		ifindex = po->ifindex;
+		proto = po->num;
+		addr = NULL;
+	} else {
+		err = -EINVAL;
+		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
+			goto out;
+		if (msg->msg_namelen < (saddr->sll_halen
+					+ offsetof(struct sockaddr_ll,
+						sll_addr)))
+			goto out;
+		ifindex = saddr->sll_ifindex;
+		proto = saddr->sll_protocol;
+		addr = saddr->sll_addr;
+	}
+
+	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
+	err = -ENXIO;
+	if (unlikely(dev == NULL))
+		goto out;
+
+	reserve = dev->hard_header_len;
+
+	err = -ENETDOWN;
+	if (unlikely(!(dev->flags & IFF_UP)))
+		goto out_put;
+
+	size_max = po->tx_ring.frame_size
+		- sizeof(struct skb_shared_info)
+		- po->tp_hdrlen
+		- LL_ALLOCATED_SPACE(dev)
+		- sizeof(struct sockaddr_ll);
+
+	if (size_max > dev->mtu + reserve)
+		size_max = dev->mtu + reserve;
+
+	do {
+		ph = packet_current_frame(po, &po->tx_ring,
+				TP_STATUS_SEND_REQUEST);
+
+		if (unlikely(ph == NULL)) {
+			schedule();
+			continue;
+		}
+
+		status = TP_STATUS_SEND_REQUEST;
+		skb = sock_alloc_send_skb(&po->sk,
+				LL_ALLOCATED_SPACE(dev)
+				+ sizeof(struct sockaddr_ll),
+				0, &err);
+
+		if (unlikely(skb == NULL))
+			goto out_status;
+
+		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
+				addr);
+
+		if (unlikely(tp_len < 0)) {
+			if (po->tp_loss) {
+				__packet_set_status(po, ph,
+						TP_STATUS_AVAILABLE);
+				packet_increment_head(&po->tx_ring);
+				kfree_skb(skb);
+				continue;
+			} else {
+				status = TP_STATUS_WRONG_FORMAT;
+				err = tp_len;
+				goto out_status;
+			}
+		}
+
+		skb->destructor = tpacket_destruct_skb;
+		__packet_set_status(po, ph, TP_STATUS_SENDING);
+		atomic_inc(&po->tx_ring.pending);
+
+		status = TP_STATUS_SEND_REQUEST;
+		err = dev_queue_xmit(skb);
+		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
+			goto out_xmit;
+		packet_increment_head(&po->tx_ring);
+		len_sum += tp_len;
+	}
+	while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
+					&& (atomic_read(&po->tx_ring.pending))))
+		);
+
+	err = len_sum;
+	goto out_put;
+
+out_xmit:
+	skb->destructor = sock_wfree;
+	atomic_dec(&po->tx_ring.pending);
+out_status:
+	__packet_set_status(po, ph, status);
+	kfree_skb(skb);
+out_put:
+	dev_put(dev);
+out:
+	mutex_unlock(&po->pg_vec_lock);
+	return err;
+}
+#endif
+
+static int packet_snd(struct socket *sock,
 			  struct msghdr *msg, size_t len)
 {
 	struct sock *sk = sock->sk;
@@ -854,6 +1153,19 @@ out:
 	return err;
 }
 
+static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *msg, size_t len)
+{
+#ifdef CONFIG_PACKET_MMAP
+	struct sock *sk = sock->sk;
+	struct packet_sock *po = pkt_sk(sk);
+	if (po->tx_ring.pg_vec)
+		return tpacket_snd(po, msg);
+	else
+#endif
+		return packet_snd(sock, msg, len);
+}
+
 /*
  *	Close a PACKET socket. This is fairly simple. We immediately go
  *	to 'closed' state and remove our protocol entry in the device list.
@@ -864,6 +1176,9 @@ static int packet_release(struct socket *sock)
 	struct sock *sk = sock->sk;
 	struct packet_sock *po;
 	struct net *net;
+#ifdef CONFIG_PACKET_MMAP
+	struct tpacket_req req;
+#endif
 
 	if (!sk)
 		return 0;
@@ -893,11 +1208,13 @@ static int packet_release(struct socket *sock)
 	packet_flush_mclist(sk);
 
 #ifdef CONFIG_PACKET_MMAP
-	if (po->pg_vec) {
-		struct tpacket_req req;
-		memset(&req, 0, sizeof(req));
-		packet_set_ring(sk, &req, 1);
-	}
+	memset(&req, 0, sizeof(req));
+
+	if (po->rx_ring.pg_vec)
+		packet_set_ring(sk, &req, 1, 0);
+
+	if (po->tx_ring.pg_vec)
+		packet_set_ring(sk, &req, 1, 1);
 #endif
 
 	/*
@@ -1391,7 +1708,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 	if (level != SOL_PACKET)
 		return -ENOPROTOOPT;
 
-	switch(optname) {
+	switch (optname) {
 	case PACKET_ADD_MEMBERSHIP:
 	case PACKET_DROP_MEMBERSHIP:
 	{
@@ -1415,6 +1732,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
 #ifdef CONFIG_PACKET_MMAP
 	case PACKET_RX_RING:
+	case PACKET_TX_RING:
 	{
 		struct tpacket_req req;
 
@@ -1422,7 +1740,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 			return -EINVAL;
 		if (copy_from_user(&req,optval,sizeof(req)))
 			return -EFAULT;
-		return packet_set_ring(sk, &req, 0);
+		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
 	}
 	case PACKET_COPY_THRESH:
 	{
@@ -1442,7 +1760,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
 		if (optlen != sizeof(val))
 			return -EINVAL;
-		if (po->pg_vec)
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
 			return -EBUSY;
 		if (copy_from_user(&val, optval, sizeof(val)))
 			return -EFAULT;
@@ -1461,13 +1779,26 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
 		if (optlen != sizeof(val))
 			return -EINVAL;
-		if (po->pg_vec)
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
 			return -EBUSY;
 		if (copy_from_user(&val, optval, sizeof(val)))
 			return -EFAULT;
 		po->tp_reserve = val;
 		return 0;
 	}
+	case PACKET_LOSS:
+	{
+		unsigned int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+			return -EBUSY;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+		po->tp_loss = !!val;
+		return 0;
+	}
 #endif
 	case PACKET_AUXDATA:
 	{
@@ -1517,7 +1848,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	if (len < 0)
 		return -EINVAL;
 
-	switch(optname) {
+	switch (optname) {
 	case PACKET_STATISTICS:
 		if (len > sizeof(struct tpacket_stats))
 			len = sizeof(struct tpacket_stats);
@@ -1573,6 +1904,12 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		val = po->tp_reserve;
 		data = &val;
 		break;
+	case PACKET_LOSS:
+		if (len > sizeof(unsigned int))
+			len = sizeof(unsigned int);
+		val = po->tp_loss;
+		data = &val;
+		break;
 #endif
 	default:
 		return -ENOPROTOOPT;
@@ -1643,7 +1980,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
 {
 	struct sock *sk = sock->sk;
 
-	switch(cmd) {
+	switch (cmd) {
 	case SIOCOUTQ:
 	{
 		int amount = atomic_read(&sk->sk_wmem_alloc);
@@ -1705,13 +2042,17 @@ static unsigned int packet_poll(struct file * file, struct socket *sock,
 	unsigned int mask = datagram_poll(file, sock, wait);
 
 	spin_lock_bh(&sk->sk_receive_queue.lock);
-	if (po->pg_vec) {
-		unsigned last = po->head ? po->head-1 : po->frame_max;
-
-		if (packet_lookup_frame(po, last, TP_STATUS_USER))
+	if (po->rx_ring.pg_vec) {
+		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
 			mask |= POLLIN | POLLRDNORM;
 	}
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
+	spin_lock_bh(&sk->sk_write_queue.lock);
+	if (po->tx_ring.pg_vec) {
+		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
+			mask |= POLLOUT | POLLWRNORM;
+	}
+	spin_unlock_bh(&sk->sk_write_queue.lock);
 	return mask;
 }
 
@@ -1788,21 +2129,33 @@ out_free_pgvec:
 	goto out;
 }
 
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
+static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+		int closing, int tx_ring)
 {
 	char **pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
+	struct packet_ring_buffer *rb;
+	struct sk_buff_head *rb_queue;
 	__be16 num;
-	int err = 0;
+	int err;
 
-	if (req->tp_block_nr) {
-		int i;
+	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
+	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
 
-		/* Sanity tests and some calculations */
+	err = -EBUSY;
+	if (!closing) {
+		if (atomic_read(&po->mapped))
+			goto out;
+		if (atomic_read(&rb->pending))
+			goto out;
+	}
 
-		if (unlikely(po->pg_vec))
-			return -EBUSY;
+	if (req->tp_block_nr) {
+		/* Sanity tests and some calculations */
+		err = -EBUSY;
+		if (unlikely(rb->pg_vec))
+			goto out;
 
 		switch (po->tp_version) {
 		case TPACKET_V1:
@@ -1813,42 +2166,35 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 			break;
 		}
 
+		err = -EINVAL;
 		if (unlikely((int)req->tp_block_size <= 0))
-			return -EINVAL;
+			goto out;
 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
-			return -EINVAL;
+			goto out;
 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
 					po->tp_reserve))
-			return -EINVAL;
+			goto out;
 		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
-			return -EINVAL;
+			goto out;
 
-		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
-		if (unlikely(po->frames_per_block <= 0))
-			return -EINVAL;
-		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
+		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
+		if (unlikely(rb->frames_per_block <= 0))
+			goto out;
+		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
 					req->tp_frame_nr))
-			return -EINVAL;
+			goto out;
 
 		err = -ENOMEM;
 		order = get_order(req->tp_block_size);
 		pg_vec = alloc_pg_vec(req, order);
 		if (unlikely(!pg_vec))
 			goto out;
-
-		for (i = 0; i < req->tp_block_nr; i++) {
-			void *ptr = pg_vec[i];
-			int k;
-
-			for (k = 0; k < po->frames_per_block; k++) {
-				__packet_set_status(po, ptr, TP_STATUS_KERNEL);
-				ptr += req->tp_frame_size;
-			}
-		}
-		/* Done */
-	} else {
+	}
+	/* Done */
+	else {
+		err = -EINVAL;
 		if (unlikely(req->tp_frame_nr))
-			return -EINVAL;
+			goto out;
 	}
 
 	lock_sock(sk);
@@ -1872,23 +2218,24 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 	if (closing || atomic_read(&po->mapped) == 0) {
 		err = 0;
 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
-
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		pg_vec = XC(po->pg_vec, pg_vec);
-		po->frame_max = (req->tp_frame_nr - 1);
-		po->head = 0;
-		po->frame_size = req->tp_frame_size;
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-
-		order = XC(po->pg_vec_order, order);
-		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
-
-		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
-		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
-		skb_queue_purge(&sk->sk_receive_queue);
+		spin_lock_bh(&rb_queue->lock);
+		pg_vec = XC(rb->pg_vec, pg_vec);
+		rb->frame_max = (req->tp_frame_nr - 1);
+		rb->head = 0;
+		rb->frame_size = req->tp_frame_size;
+		spin_unlock_bh(&rb_queue->lock);
+
+		order = XC(rb->pg_vec_order, order);
+		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
+
+		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
+		po->prot_hook.func = (po->rx_ring.pg_vec) ?
+						tpacket_rcv : packet_rcv;
+		skb_queue_purge(rb_queue);
 #undef XC
 		if (atomic_read(&po->mapped))
-			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
+			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
+					atomic_read(&po->mapped));
 	}
 	mutex_unlock(&po->pg_vec_lock);
 
@@ -1909,11 +2256,13 @@ out:
 	return err;
 }
 
-static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+static int packet_mmap(struct file *file, struct socket *sock,
+		struct vm_area_struct *vma)
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po = pkt_sk(sk);
-	unsigned long size;
+	unsigned long size, expected_size;
+	struct packet_ring_buffer *rb;
 	unsigned long start;
 	int err = -EINVAL;
 	int i;
@@ -1921,26 +2270,43 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st
 	if (vma->vm_pgoff)
 		return -EINVAL;
 
-	size = vma->vm_end - vma->vm_start;
-
 	mutex_lock(&po->pg_vec_lock);
-	if (po->pg_vec == NULL)
+
+	expected_size = 0;
+	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
+		if (rb->pg_vec) {
+			expected_size += rb->pg_vec_len
+						* rb->pg_vec_pages
+						* PAGE_SIZE;
+		}
+	}
+
+	if (expected_size == 0)
 		goto out;
-	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
+
+	size = vma->vm_end - vma->vm_start;
+	if (size != expected_size)
 		goto out;
 
 	start = vma->vm_start;
-	for (i = 0; i < po->pg_vec_len; i++) {
-		struct page *page = virt_to_page(po->pg_vec[i]);
-		int pg_num;
+	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
+		if (rb->pg_vec == NULL)
+			continue;
 
-		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
-			err = vm_insert_page(vma, start, page);
-			if (unlikely(err))
-				goto out;
-			start += PAGE_SIZE;
+		for (i = 0; i < rb->pg_vec_len; i++) {
+			struct page *page = virt_to_page(rb->pg_vec[i]);
+			int pg_num;
+
+			for (pg_num = 0; pg_num < rb->pg_vec_pages;
+					pg_num++, page++) {
+				err = vm_insert_page(vma, start, page);
+				if (unlikely(err))
+					goto out;
+				start += PAGE_SIZE;
+			}
 		}
 	}
+
 	atomic_inc(&po->mapped);
 	vma->vm_ops = &packet_mmap_ops;
 	err = 0;