about summary refs log tree commit diff stats
path: root/net/packet/af_packet.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r-- net/packet/af_packet.c 604
1 files changed, 487 insertions, 117 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f546e81acc45..4f76e5552d8e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -39,6 +39,7 @@
39 * will simply extend the hardware address 39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll 40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq. 41 * and packet_mreq.
42 * Johann Baudy : Added TX RING.
42 * 43 *
43 * This program is free software; you can redistribute it and/or 44 * This program is free software; you can redistribute it and/or
44 * modify it under the terms of the GNU General Public License 45 * modify it under the terms of the GNU General Public License
@@ -157,7 +158,25 @@ struct packet_mreq_max
157}; 158};
158 159
159#ifdef CONFIG_PACKET_MMAP 160#ifdef CONFIG_PACKET_MMAP
160static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing); 161static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
162 int closing, int tx_ring);
163
164struct packet_ring_buffer {
165 char * *pg_vec;
166 unsigned int head;
167 unsigned int frames_per_block;
168 unsigned int frame_size;
169 unsigned int frame_max;
170
171 unsigned int pg_vec_order;
172 unsigned int pg_vec_pages;
173 unsigned int pg_vec_len;
174
175 atomic_t pending;
176};
177
178struct packet_sock;
179static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
161#endif 180#endif
162 181
163static void packet_flush_mclist(struct sock *sk); 182static void packet_flush_mclist(struct sock *sk);
@@ -167,11 +186,8 @@ struct packet_sock {
167 struct sock sk; 186 struct sock sk;
168 struct tpacket_stats stats; 187 struct tpacket_stats stats;
169#ifdef CONFIG_PACKET_MMAP 188#ifdef CONFIG_PACKET_MMAP
170 char * *pg_vec; 189 struct packet_ring_buffer rx_ring;
171 unsigned int head; 190 struct packet_ring_buffer tx_ring;
172 unsigned int frames_per_block;
173 unsigned int frame_size;
174 unsigned int frame_max;
175 int copy_thresh; 191 int copy_thresh;
176#endif 192#endif
177 struct packet_type prot_hook; 193 struct packet_type prot_hook;
@@ -185,12 +201,10 @@ struct packet_sock {
185 struct packet_mclist *mclist; 201 struct packet_mclist *mclist;
186#ifdef CONFIG_PACKET_MMAP 202#ifdef CONFIG_PACKET_MMAP
187 atomic_t mapped; 203 atomic_t mapped;
188 unsigned int pg_vec_order;
189 unsigned int pg_vec_pages;
190 unsigned int pg_vec_len;
191 enum tpacket_versions tp_version; 204 enum tpacket_versions tp_version;
192 unsigned int tp_hdrlen; 205 unsigned int tp_hdrlen;
193 unsigned int tp_reserve; 206 unsigned int tp_reserve;
207 unsigned int tp_loss:1;
194#endif 208#endif
195}; 209};
196 210
@@ -206,36 +220,33 @@ struct packet_skb_cb {
206 220
207#ifdef CONFIG_PACKET_MMAP 221#ifdef CONFIG_PACKET_MMAP
208 222
209static void *packet_lookup_frame(struct packet_sock *po, unsigned int position, 223static void __packet_set_status(struct packet_sock *po, void *frame, int status)
210 int status)
211{ 224{
212 unsigned int pg_vec_pos, frame_offset;
213 union { 225 union {
214 struct tpacket_hdr *h1; 226 struct tpacket_hdr *h1;
215 struct tpacket2_hdr *h2; 227 struct tpacket2_hdr *h2;
216 void *raw; 228 void *raw;
217 } h; 229 } h;
218 230
219 pg_vec_pos = position / po->frames_per_block; 231 h.raw = frame;
220 frame_offset = position % po->frames_per_block;
221
222 h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
223 switch (po->tp_version) { 232 switch (po->tp_version) {
224 case TPACKET_V1: 233 case TPACKET_V1:
225 if (status != (h.h1->tp_status ? TP_STATUS_USER : 234 h.h1->tp_status = status;
226 TP_STATUS_KERNEL)) 235 flush_dcache_page(virt_to_page(&h.h1->tp_status));
227 return NULL;
228 break; 236 break;
229 case TPACKET_V2: 237 case TPACKET_V2:
230 if (status != (h.h2->tp_status ? TP_STATUS_USER : 238 h.h2->tp_status = status;
231 TP_STATUS_KERNEL)) 239 flush_dcache_page(virt_to_page(&h.h2->tp_status));
232 return NULL;
233 break; 240 break;
241 default:
242 printk(KERN_ERR "TPACKET version not supported\n");
243 BUG();
234 } 244 }
235 return h.raw; 245
246 smp_wmb();
236} 247}
237 248
238static void __packet_set_status(struct packet_sock *po, void *frame, int status) 249static int __packet_get_status(struct packet_sock *po, void *frame)
239{ 250{
240 union { 251 union {
241 struct tpacket_hdr *h1; 252 struct tpacket_hdr *h1;
@@ -243,16 +254,66 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
243 void *raw; 254 void *raw;
244 } h; 255 } h;
245 256
257 smp_rmb();
258
246 h.raw = frame; 259 h.raw = frame;
247 switch (po->tp_version) { 260 switch (po->tp_version) {
248 case TPACKET_V1: 261 case TPACKET_V1:
249 h.h1->tp_status = status; 262 flush_dcache_page(virt_to_page(&h.h1->tp_status));
250 break; 263 return h.h1->tp_status;
251 case TPACKET_V2: 264 case TPACKET_V2:
252 h.h2->tp_status = status; 265 flush_dcache_page(virt_to_page(&h.h2->tp_status));
253 break; 266 return h.h2->tp_status;
267 default:
268 printk(KERN_ERR "TPACKET version not supported\n");
269 BUG();
270 return 0;
254 } 271 }
255} 272}
273
274static void *packet_lookup_frame(struct packet_sock *po,
275 struct packet_ring_buffer *rb,
276 unsigned int position,
277 int status)
278{
279 unsigned int pg_vec_pos, frame_offset;
280 union {
281 struct tpacket_hdr *h1;
282 struct tpacket2_hdr *h2;
283 void *raw;
284 } h;
285
286 pg_vec_pos = position / rb->frames_per_block;
287 frame_offset = position % rb->frames_per_block;
288
289 h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
290
291 if (status != __packet_get_status(po, h.raw))
292 return NULL;
293
294 return h.raw;
295}
296
297static inline void *packet_current_frame(struct packet_sock *po,
298 struct packet_ring_buffer *rb,
299 int status)
300{
301 return packet_lookup_frame(po, rb, rb->head, status);
302}
303
304static inline void *packet_previous_frame(struct packet_sock *po,
305 struct packet_ring_buffer *rb,
306 int status)
307{
308 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
309 return packet_lookup_frame(po, rb, previous, status);
310}
311
312static inline void packet_increment_head(struct packet_ring_buffer *buff)
313{
314 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
315}
316
256#endif 317#endif
257 318
258static inline struct packet_sock *pkt_sk(struct sock *sk) 319static inline struct packet_sock *pkt_sk(struct sock *sk)
@@ -311,8 +372,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct
311 goto oom; 372 goto oom;
312 373
313 /* drop any routing info */ 374 /* drop any routing info */
314 dst_release(skb->dst); 375 skb_dst_drop(skb);
315 skb->dst = NULL;
316 376
317 /* drop conntrack reference */ 377 /* drop conntrack reference */
318 nf_reset(skb); 378 nf_reset(skb);
@@ -560,8 +620,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet
560 620
561 skb_set_owner_r(skb, sk); 621 skb_set_owner_r(skb, sk);
562 skb->dev = NULL; 622 skb->dev = NULL;
563 dst_release(skb->dst); 623 skb_dst_drop(skb);
564 skb->dst = NULL;
565 624
566 /* drop conntrack reference */ 625 /* drop conntrack reference */
567 nf_reset(skb); 626 nf_reset(skb);
@@ -648,7 +707,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
648 macoff = netoff - maclen; 707 macoff = netoff - maclen;
649 } 708 }
650 709
651 if (macoff + snaplen > po->frame_size) { 710 if (macoff + snaplen > po->rx_ring.frame_size) {
652 if (po->copy_thresh && 711 if (po->copy_thresh &&
653 atomic_read(&sk->sk_rmem_alloc) + skb->truesize < 712 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
654 (unsigned)sk->sk_rcvbuf) { 713 (unsigned)sk->sk_rcvbuf) {
@@ -661,16 +720,16 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
661 if (copy_skb) 720 if (copy_skb)
662 skb_set_owner_r(copy_skb, sk); 721 skb_set_owner_r(copy_skb, sk);
663 } 722 }
664 snaplen = po->frame_size - macoff; 723 snaplen = po->rx_ring.frame_size - macoff;
665 if ((int)snaplen < 0) 724 if ((int)snaplen < 0)
666 snaplen = 0; 725 snaplen = 0;
667 } 726 }
668 727
669 spin_lock(&sk->sk_receive_queue.lock); 728 spin_lock(&sk->sk_receive_queue.lock);
670 h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL); 729 h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
671 if (!h.raw) 730 if (!h.raw)
672 goto ring_is_full; 731 goto ring_is_full;
673 po->head = po->head != po->frame_max ? po->head+1 : 0; 732 packet_increment_head(&po->rx_ring);
674 po->stats.tp_packets++; 733 po->stats.tp_packets++;
675 if (copy_skb) { 734 if (copy_skb) {
676 status |= TP_STATUS_COPY; 735 status |= TP_STATUS_COPY;
@@ -727,7 +786,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
727 786
728 __packet_set_status(po, h.raw, status); 787 __packet_set_status(po, h.raw, status);
729 smp_mb(); 788 smp_mb();
730
731 { 789 {
732 struct page *p_start, *p_end; 790 struct page *p_start, *p_end;
733 u8 *h_end = h.raw + macoff + snaplen - 1; 791 u8 *h_end = h.raw + macoff + snaplen - 1;
@@ -760,10 +818,249 @@ ring_is_full:
760 goto drop_n_restore; 818 goto drop_n_restore;
761} 819}
762 820
763#endif 821static void tpacket_destruct_skb(struct sk_buff *skb)
822{
823 struct packet_sock *po = pkt_sk(skb->sk);
824 void * ph;
764 825
826 BUG_ON(skb == NULL);
765 827
766static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, 828 if (likely(po->tx_ring.pg_vec)) {
829 ph = skb_shinfo(skb)->destructor_arg;
830 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
831 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
832 atomic_dec(&po->tx_ring.pending);
833 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
834 }
835
836 sock_wfree(skb);
837}
838
839static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff * skb,
840 void * frame, struct net_device *dev, int size_max,
841 __be16 proto, unsigned char * addr)
842{
843 union {
844 struct tpacket_hdr *h1;
845 struct tpacket2_hdr *h2;
846 void *raw;
847 } ph;
848 int to_write, offset, len, tp_len, nr_frags, len_max;
849 struct socket *sock = po->sk.sk_socket;
850 struct page *page;
851 void *data;
852 int err;
853
854 ph.raw = frame;
855
856 skb->protocol = proto;
857 skb->dev = dev;
858 skb->priority = po->sk.sk_priority;
859 skb_shinfo(skb)->destructor_arg = ph.raw;
860
861 switch (po->tp_version) {
862 case TPACKET_V2:
863 tp_len = ph.h2->tp_len;
864 break;
865 default:
866 tp_len = ph.h1->tp_len;
867 break;
868 }
869 if (unlikely(tp_len > size_max)) {
870 printk(KERN_ERR "packet size is too long (%d > %d)\n",
871 tp_len, size_max);
872 return -EMSGSIZE;
873 }
874
875 skb_reserve(skb, LL_RESERVED_SPACE(dev));
876 skb_reset_network_header(skb);
877
878 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
879 to_write = tp_len;
880
881 if (sock->type == SOCK_DGRAM) {
882 err = dev_hard_header(skb, dev, ntohs(proto), addr,
883 NULL, tp_len);
884 if (unlikely(err < 0))
885 return -EINVAL;
886 } else if (dev->hard_header_len ) {
887 /* net device doesn't like empty head */
888 if (unlikely(tp_len <= dev->hard_header_len)) {
889 printk(KERN_ERR "packet size is too short "
890 "(%d < %d)\n", tp_len,
891 dev->hard_header_len);
892 return -EINVAL;
893 }
894
895 skb_push(skb, dev->hard_header_len);
896 err = skb_store_bits(skb, 0, data,
897 dev->hard_header_len);
898 if (unlikely(err))
899 return err;
900
901 data += dev->hard_header_len;
902 to_write -= dev->hard_header_len;
903 }
904
905 err = -EFAULT;
906 page = virt_to_page(data);
907 offset = offset_in_page(data);
908 len_max = PAGE_SIZE - offset;
909 len = ((to_write > len_max) ? len_max : to_write);
910
911 skb->data_len = to_write;
912 skb->len += to_write;
913 skb->truesize += to_write;
914 atomic_add(to_write, &po->sk.sk_wmem_alloc);
915
916 while (likely(to_write)) {
917 nr_frags = skb_shinfo(skb)->nr_frags;
918
919 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
920 printk(KERN_ERR "Packet exceed the number "
921 "of skb frags(%lu)\n",
922 MAX_SKB_FRAGS);
923 return -EFAULT;
924 }
925
926 flush_dcache_page(page);
927 get_page(page);
928 skb_fill_page_desc(skb,
929 nr_frags,
930 page++, offset, len);
931 to_write -= len;
932 offset = 0;
933 len_max = PAGE_SIZE;
934 len = ((to_write > len_max) ? len_max : to_write);
935 }
936
937 return tp_len;
938}
939
940static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
941{
942 struct socket *sock;
943 struct sk_buff *skb;
944 struct net_device *dev;
945 __be16 proto;
946 int ifindex, err, reserve = 0;
947 void * ph;
948 struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
949 int tp_len, size_max;
950 unsigned char *addr;
951 int len_sum = 0;
952 int status = 0;
953
954 sock = po->sk.sk_socket;
955
956 mutex_lock(&po->pg_vec_lock);
957
958 err = -EBUSY;
959 if (saddr == NULL) {
960 ifindex = po->ifindex;
961 proto = po->num;
962 addr = NULL;
963 } else {
964 err = -EINVAL;
965 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
966 goto out;
967 if (msg->msg_namelen < (saddr->sll_halen
968 + offsetof(struct sockaddr_ll,
969 sll_addr)))
970 goto out;
971 ifindex = saddr->sll_ifindex;
972 proto = saddr->sll_protocol;
973 addr = saddr->sll_addr;
974 }
975
976 dev = dev_get_by_index(sock_net(&po->sk), ifindex);
977 err = -ENXIO;
978 if (unlikely(dev == NULL))
979 goto out;
980
981 reserve = dev->hard_header_len;
982
983 err = -ENETDOWN;
984 if (unlikely(!(dev->flags & IFF_UP)))
985 goto out_put;
986
987 size_max = po->tx_ring.frame_size
988 - sizeof(struct skb_shared_info)
989 - po->tp_hdrlen
990 - LL_ALLOCATED_SPACE(dev)
991 - sizeof(struct sockaddr_ll);
992
993 if (size_max > dev->mtu + reserve)
994 size_max = dev->mtu + reserve;
995
996 do {
997 ph = packet_current_frame(po, &po->tx_ring,
998 TP_STATUS_SEND_REQUEST);
999
1000 if (unlikely(ph == NULL)) {
1001 schedule();
1002 continue;
1003 }
1004
1005 status = TP_STATUS_SEND_REQUEST;
1006 skb = sock_alloc_send_skb(&po->sk,
1007 LL_ALLOCATED_SPACE(dev)
1008 + sizeof(struct sockaddr_ll),
1009 0, &err);
1010
1011 if (unlikely(skb == NULL))
1012 goto out_status;
1013
1014 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1015 addr);
1016
1017 if (unlikely(tp_len < 0)) {
1018 if (po->tp_loss) {
1019 __packet_set_status(po, ph,
1020 TP_STATUS_AVAILABLE);
1021 packet_increment_head(&po->tx_ring);
1022 kfree_skb(skb);
1023 continue;
1024 } else {
1025 status = TP_STATUS_WRONG_FORMAT;
1026 err = tp_len;
1027 goto out_status;
1028 }
1029 }
1030
1031 skb->destructor = tpacket_destruct_skb;
1032 __packet_set_status(po, ph, TP_STATUS_SENDING);
1033 atomic_inc(&po->tx_ring.pending);
1034
1035 status = TP_STATUS_SEND_REQUEST;
1036 err = dev_queue_xmit(skb);
1037 if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1038 goto out_xmit;
1039 packet_increment_head(&po->tx_ring);
1040 len_sum += tp_len;
1041 }
1042 while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1043 && (atomic_read(&po->tx_ring.pending))))
1044 );
1045
1046 err = len_sum;
1047 goto out_put;
1048
1049out_xmit:
1050 skb->destructor = sock_wfree;
1051 atomic_dec(&po->tx_ring.pending);
1052out_status:
1053 __packet_set_status(po, ph, status);
1054 kfree_skb(skb);
1055out_put:
1056 dev_put(dev);
1057out:
1058 mutex_unlock(&po->pg_vec_lock);
1059 return err;
1060}
1061#endif
1062
1063static int packet_snd(struct socket *sock,
767 struct msghdr *msg, size_t len) 1064 struct msghdr *msg, size_t len)
768{ 1065{
769 struct sock *sk = sock->sk; 1066 struct sock *sk = sock->sk;
@@ -854,6 +1151,19 @@ out:
854 return err; 1151 return err;
855} 1152}
856 1153
1154static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1155 struct msghdr *msg, size_t len)
1156{
1157#ifdef CONFIG_PACKET_MMAP
1158 struct sock *sk = sock->sk;
1159 struct packet_sock *po = pkt_sk(sk);
1160 if (po->tx_ring.pg_vec)
1161 return tpacket_snd(po, msg);
1162 else
1163#endif
1164 return packet_snd(sock, msg, len);
1165}
1166
857/* 1167/*
858 * Close a PACKET socket. This is fairly simple. We immediately go 1168 * Close a PACKET socket. This is fairly simple. We immediately go
859 * to 'closed' state and remove our protocol entry in the device list. 1169 * to 'closed' state and remove our protocol entry in the device list.
@@ -864,6 +1174,9 @@ static int packet_release(struct socket *sock)
864 struct sock *sk = sock->sk; 1174 struct sock *sk = sock->sk;
865 struct packet_sock *po; 1175 struct packet_sock *po;
866 struct net *net; 1176 struct net *net;
1177#ifdef CONFIG_PACKET_MMAP
1178 struct tpacket_req req;
1179#endif
867 1180
868 if (!sk) 1181 if (!sk)
869 return 0; 1182 return 0;
@@ -893,11 +1206,13 @@ static int packet_release(struct socket *sock)
893 packet_flush_mclist(sk); 1206 packet_flush_mclist(sk);
894 1207
895#ifdef CONFIG_PACKET_MMAP 1208#ifdef CONFIG_PACKET_MMAP
896 if (po->pg_vec) { 1209 memset(&req, 0, sizeof(req));
897 struct tpacket_req req; 1210
898 memset(&req, 0, sizeof(req)); 1211 if (po->rx_ring.pg_vec)
899 packet_set_ring(sk, &req, 1); 1212 packet_set_ring(sk, &req, 1, 0);
900 } 1213
1214 if (po->tx_ring.pg_vec)
1215 packet_set_ring(sk, &req, 1, 1);
901#endif 1216#endif
902 1217
903 /* 1218 /*
@@ -1253,9 +1568,9 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1253 switch (i->type) { 1568 switch (i->type) {
1254 case PACKET_MR_MULTICAST: 1569 case PACKET_MR_MULTICAST:
1255 if (what > 0) 1570 if (what > 0)
1256 dev_mc_add(dev, i->addr, i->alen, 0); 1571 return dev_mc_add(dev, i->addr, i->alen, 0);
1257 else 1572 else
1258 dev_mc_delete(dev, i->addr, i->alen, 0); 1573 return dev_mc_delete(dev, i->addr, i->alen, 0);
1259 break; 1574 break;
1260 case PACKET_MR_PROMISC: 1575 case PACKET_MR_PROMISC:
1261 return dev_set_promiscuity(dev, what); 1576 return dev_set_promiscuity(dev, what);
@@ -1263,6 +1578,12 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1263 case PACKET_MR_ALLMULTI: 1578 case PACKET_MR_ALLMULTI:
1264 return dev_set_allmulti(dev, what); 1579 return dev_set_allmulti(dev, what);
1265 break; 1580 break;
1581 case PACKET_MR_UNICAST:
1582 if (what > 0)
1583 return dev_unicast_add(dev, i->addr);
1584 else
1585 return dev_unicast_delete(dev, i->addr);
1586 break;
1266 default:; 1587 default:;
1267 } 1588 }
1268 return 0; 1589 return 0;
@@ -1391,7 +1712,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1391 if (level != SOL_PACKET) 1712 if (level != SOL_PACKET)
1392 return -ENOPROTOOPT; 1713 return -ENOPROTOOPT;
1393 1714
1394 switch(optname) { 1715 switch (optname) {
1395 case PACKET_ADD_MEMBERSHIP: 1716 case PACKET_ADD_MEMBERSHIP:
1396 case PACKET_DROP_MEMBERSHIP: 1717 case PACKET_DROP_MEMBERSHIP:
1397 { 1718 {
@@ -1415,6 +1736,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1415 1736
1416#ifdef CONFIG_PACKET_MMAP 1737#ifdef CONFIG_PACKET_MMAP
1417 case PACKET_RX_RING: 1738 case PACKET_RX_RING:
1739 case PACKET_TX_RING:
1418 { 1740 {
1419 struct tpacket_req req; 1741 struct tpacket_req req;
1420 1742
@@ -1422,7 +1744,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1422 return -EINVAL; 1744 return -EINVAL;
1423 if (copy_from_user(&req,optval,sizeof(req))) 1745 if (copy_from_user(&req,optval,sizeof(req)))
1424 return -EFAULT; 1746 return -EFAULT;
1425 return packet_set_ring(sk, &req, 0); 1747 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1426 } 1748 }
1427 case PACKET_COPY_THRESH: 1749 case PACKET_COPY_THRESH:
1428 { 1750 {
@@ -1442,7 +1764,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1442 1764
1443 if (optlen != sizeof(val)) 1765 if (optlen != sizeof(val))
1444 return -EINVAL; 1766 return -EINVAL;
1445 if (po->pg_vec) 1767 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1446 return -EBUSY; 1768 return -EBUSY;
1447 if (copy_from_user(&val, optval, sizeof(val))) 1769 if (copy_from_user(&val, optval, sizeof(val)))
1448 return -EFAULT; 1770 return -EFAULT;
@@ -1461,13 +1783,26 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
1461 1783
1462 if (optlen != sizeof(val)) 1784 if (optlen != sizeof(val))
1463 return -EINVAL; 1785 return -EINVAL;
1464 if (po->pg_vec) 1786 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1465 return -EBUSY; 1787 return -EBUSY;
1466 if (copy_from_user(&val, optval, sizeof(val))) 1788 if (copy_from_user(&val, optval, sizeof(val)))
1467 return -EFAULT; 1789 return -EFAULT;
1468 po->tp_reserve = val; 1790 po->tp_reserve = val;
1469 return 0; 1791 return 0;
1470 } 1792 }
1793 case PACKET_LOSS:
1794 {
1795 unsigned int val;
1796
1797 if (optlen != sizeof(val))
1798 return -EINVAL;
1799 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1800 return -EBUSY;
1801 if (copy_from_user(&val, optval, sizeof(val)))
1802 return -EFAULT;
1803 po->tp_loss = !!val;
1804 return 0;
1805 }
1471#endif 1806#endif
1472 case PACKET_AUXDATA: 1807 case PACKET_AUXDATA:
1473 { 1808 {
@@ -1517,7 +1852,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
1517 if (len < 0) 1852 if (len < 0)
1518 return -EINVAL; 1853 return -EINVAL;
1519 1854
1520 switch(optname) { 1855 switch (optname) {
1521 case PACKET_STATISTICS: 1856 case PACKET_STATISTICS:
1522 if (len > sizeof(struct tpacket_stats)) 1857 if (len > sizeof(struct tpacket_stats))
1523 len = sizeof(struct tpacket_stats); 1858 len = sizeof(struct tpacket_stats);
@@ -1573,6 +1908,12 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
1573 val = po->tp_reserve; 1908 val = po->tp_reserve;
1574 data = &val; 1909 data = &val;
1575 break; 1910 break;
1911 case PACKET_LOSS:
1912 if (len > sizeof(unsigned int))
1913 len = sizeof(unsigned int);
1914 val = po->tp_loss;
1915 data = &val;
1916 break;
1576#endif 1917#endif
1577 default: 1918 default:
1578 return -ENOPROTOOPT; 1919 return -ENOPROTOOPT;
@@ -1643,7 +1984,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
1643{ 1984{
1644 struct sock *sk = sock->sk; 1985 struct sock *sk = sock->sk;
1645 1986
1646 switch(cmd) { 1987 switch (cmd) {
1647 case SIOCOUTQ: 1988 case SIOCOUTQ:
1648 { 1989 {
1649 int amount = atomic_read(&sk->sk_wmem_alloc); 1990 int amount = atomic_read(&sk->sk_wmem_alloc);
@@ -1705,13 +2046,17 @@ static unsigned int packet_poll(struct file * file, struct socket *sock,
1705 unsigned int mask = datagram_poll(file, sock, wait); 2046 unsigned int mask = datagram_poll(file, sock, wait);
1706 2047
1707 spin_lock_bh(&sk->sk_receive_queue.lock); 2048 spin_lock_bh(&sk->sk_receive_queue.lock);
1708 if (po->pg_vec) { 2049 if (po->rx_ring.pg_vec) {
1709 unsigned last = po->head ? po->head-1 : po->frame_max; 2050 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
1710
1711 if (packet_lookup_frame(po, last, TP_STATUS_USER))
1712 mask |= POLLIN | POLLRDNORM; 2051 mask |= POLLIN | POLLRDNORM;
1713 } 2052 }
1714 spin_unlock_bh(&sk->sk_receive_queue.lock); 2053 spin_unlock_bh(&sk->sk_receive_queue.lock);
2054 spin_lock_bh(&sk->sk_write_queue.lock);
2055 if (po->tx_ring.pg_vec) {
2056 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2057 mask |= POLLOUT | POLLWRNORM;
2058 }
2059 spin_unlock_bh(&sk->sk_write_queue.lock);
1715 return mask; 2060 return mask;
1716} 2061}
1717 2062
@@ -1788,21 +2133,33 @@ out_free_pgvec:
1788 goto out; 2133 goto out;
1789} 2134}
1790 2135
1791static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) 2136static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2137 int closing, int tx_ring)
1792{ 2138{
1793 char **pg_vec = NULL; 2139 char **pg_vec = NULL;
1794 struct packet_sock *po = pkt_sk(sk); 2140 struct packet_sock *po = pkt_sk(sk);
1795 int was_running, order = 0; 2141 int was_running, order = 0;
2142 struct packet_ring_buffer *rb;
2143 struct sk_buff_head *rb_queue;
1796 __be16 num; 2144 __be16 num;
1797 int err = 0; 2145 int err;
1798 2146
1799 if (req->tp_block_nr) { 2147 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
1800 int i; 2148 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1801 2149
1802 /* Sanity tests and some calculations */ 2150 err = -EBUSY;
2151 if (!closing) {
2152 if (atomic_read(&po->mapped))
2153 goto out;
2154 if (atomic_read(&rb->pending))
2155 goto out;
2156 }
1803 2157
1804 if (unlikely(po->pg_vec)) 2158 if (req->tp_block_nr) {
1805 return -EBUSY; 2159 /* Sanity tests and some calculations */
2160 err = -EBUSY;
2161 if (unlikely(rb->pg_vec))
2162 goto out;
1806 2163
1807 switch (po->tp_version) { 2164 switch (po->tp_version) {
1808 case TPACKET_V1: 2165 case TPACKET_V1:
@@ -1813,42 +2170,35 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
1813 break; 2170 break;
1814 } 2171 }
1815 2172
2173 err = -EINVAL;
1816 if (unlikely((int)req->tp_block_size <= 0)) 2174 if (unlikely((int)req->tp_block_size <= 0))
1817 return -EINVAL; 2175 goto out;
1818 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) 2176 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1819 return -EINVAL; 2177 goto out;
1820 if (unlikely(req->tp_frame_size < po->tp_hdrlen + 2178 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
1821 po->tp_reserve)) 2179 po->tp_reserve))
1822 return -EINVAL; 2180 goto out;
1823 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) 2181 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1824 return -EINVAL; 2182 goto out;
1825 2183
1826 po->frames_per_block = req->tp_block_size/req->tp_frame_size; 2184 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
1827 if (unlikely(po->frames_per_block <= 0)) 2185 if (unlikely(rb->frames_per_block <= 0))
1828 return -EINVAL; 2186 goto out;
1829 if (unlikely((po->frames_per_block * req->tp_block_nr) != 2187 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
1830 req->tp_frame_nr)) 2188 req->tp_frame_nr))
1831 return -EINVAL; 2189 goto out;
1832 2190
1833 err = -ENOMEM; 2191 err = -ENOMEM;
1834 order = get_order(req->tp_block_size); 2192 order = get_order(req->tp_block_size);
1835 pg_vec = alloc_pg_vec(req, order); 2193 pg_vec = alloc_pg_vec(req, order);
1836 if (unlikely(!pg_vec)) 2194 if (unlikely(!pg_vec))
1837 goto out; 2195 goto out;
1838 2196 }
1839 for (i = 0; i < req->tp_block_nr; i++) { 2197 /* Done */
1840 void *ptr = pg_vec[i]; 2198 else {
1841 int k; 2199 err = -EINVAL;
1842
1843 for (k = 0; k < po->frames_per_block; k++) {
1844 __packet_set_status(po, ptr, TP_STATUS_KERNEL);
1845 ptr += req->tp_frame_size;
1846 }
1847 }
1848 /* Done */
1849 } else {
1850 if (unlikely(req->tp_frame_nr)) 2200 if (unlikely(req->tp_frame_nr))
1851 return -EINVAL; 2201 goto out;
1852 } 2202 }
1853 2203
1854 lock_sock(sk); 2204 lock_sock(sk);
@@ -1872,23 +2222,24 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
1872 if (closing || atomic_read(&po->mapped) == 0) { 2222 if (closing || atomic_read(&po->mapped) == 0) {
1873 err = 0; 2223 err = 0;
1874#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; }) 2224#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1875 2225 spin_lock_bh(&rb_queue->lock);
1876 spin_lock_bh(&sk->sk_receive_queue.lock); 2226 pg_vec = XC(rb->pg_vec, pg_vec);
1877 pg_vec = XC(po->pg_vec, pg_vec); 2227 rb->frame_max = (req->tp_frame_nr - 1);
1878 po->frame_max = (req->tp_frame_nr - 1); 2228 rb->head = 0;
1879 po->head = 0; 2229 rb->frame_size = req->tp_frame_size;
1880 po->frame_size = req->tp_frame_size; 2230 spin_unlock_bh(&rb_queue->lock);
1881 spin_unlock_bh(&sk->sk_receive_queue.lock); 2231
1882 2232 order = XC(rb->pg_vec_order, order);
1883 order = XC(po->pg_vec_order, order); 2233 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
1884 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr); 2234
1885 2235 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1886 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE; 2236 po->prot_hook.func = (po->rx_ring.pg_vec) ?
1887 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv; 2237 tpacket_rcv : packet_rcv;
1888 skb_queue_purge(&sk->sk_receive_queue); 2238 skb_queue_purge(rb_queue);
1889#undef XC 2239#undef XC
1890 if (atomic_read(&po->mapped)) 2240 if (atomic_read(&po->mapped))
1891 printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped)); 2241 printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
2242 atomic_read(&po->mapped));
1892 } 2243 }
1893 mutex_unlock(&po->pg_vec_lock); 2244 mutex_unlock(&po->pg_vec_lock);
1894 2245
@@ -1909,11 +2260,13 @@ out:
1909 return err; 2260 return err;
1910} 2261}
1911 2262
1912static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2263static int packet_mmap(struct file *file, struct socket *sock,
2264 struct vm_area_struct *vma)
1913{ 2265{
1914 struct sock *sk = sock->sk; 2266 struct sock *sk = sock->sk;
1915 struct packet_sock *po = pkt_sk(sk); 2267 struct packet_sock *po = pkt_sk(sk);
1916 unsigned long size; 2268 unsigned long size, expected_size;
2269 struct packet_ring_buffer *rb;
1917 unsigned long start; 2270 unsigned long start;
1918 int err = -EINVAL; 2271 int err = -EINVAL;
1919 int i; 2272 int i;
@@ -1921,26 +2274,43 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st
1921 if (vma->vm_pgoff) 2274 if (vma->vm_pgoff)
1922 return -EINVAL; 2275 return -EINVAL;
1923 2276
1924 size = vma->vm_end - vma->vm_start;
1925
1926 mutex_lock(&po->pg_vec_lock); 2277 mutex_lock(&po->pg_vec_lock);
1927 if (po->pg_vec == NULL) 2278
2279 expected_size = 0;
2280 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2281 if (rb->pg_vec) {
2282 expected_size += rb->pg_vec_len
2283 * rb->pg_vec_pages
2284 * PAGE_SIZE;
2285 }
2286 }
2287
2288 if (expected_size == 0)
1928 goto out; 2289 goto out;
1929 if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE) 2290
2291 size = vma->vm_end - vma->vm_start;
2292 if (size != expected_size)
1930 goto out; 2293 goto out;
1931 2294
1932 start = vma->vm_start; 2295 start = vma->vm_start;
1933 for (i = 0; i < po->pg_vec_len; i++) { 2296 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
1934 struct page *page = virt_to_page(po->pg_vec[i]); 2297 if (rb->pg_vec == NULL)
1935 int pg_num; 2298 continue;
1936 2299
1937 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) { 2300 for (i = 0; i < rb->pg_vec_len; i++) {
1938 err = vm_insert_page(vma, start, page); 2301 struct page *page = virt_to_page(rb->pg_vec[i]);
1939 if (unlikely(err)) 2302 int pg_num;
1940 goto out; 2303
1941 start += PAGE_SIZE; 2304 for (pg_num = 0; pg_num < rb->pg_vec_pages;
2305 pg_num++,page++) {
2306 err = vm_insert_page(vma, start, page);
2307 if (unlikely(err))
2308 goto out;
2309 start += PAGE_SIZE;
2310 }
1942 } 2311 }
1943 } 2312 }
2313
1944 atomic_inc(&po->mapped); 2314 atomic_inc(&po->mapped);
1945 vma->vm_ops = &packet_mmap_ops; 2315 vma->vm_ops = &packet_mmap_ops;
1946 err = 0; 2316 err = 0;