aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/net/inet_sock.h23
-rw-r--r--net/ipv4/ip_output.c238
2 files changed, 154 insertions, 107 deletions
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 6e6dfd757682..7a37369f8ea3 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -86,6 +86,19 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
86 return (struct inet_request_sock *)sk; 86 return (struct inet_request_sock *)sk;
87} 87}
88 88
89struct inet_cork {
90 unsigned int flags;
91 unsigned int fragsize;
92 struct ip_options *opt;
93 struct dst_entry *dst;
94 int length; /* Total length of all frames */
95 __be32 addr;
96 struct flowi fl;
97 struct page *page;
98 u32 off;
99 u8 tx_flags;
100};
101
89struct ip_mc_socklist; 102struct ip_mc_socklist;
90struct ipv6_pinfo; 103struct ipv6_pinfo;
91struct rtable; 104struct rtable;
@@ -143,15 +156,7 @@ struct inet_sock {
143 int mc_index; 156 int mc_index;
144 __be32 mc_addr; 157 __be32 mc_addr;
145 struct ip_mc_socklist __rcu *mc_list; 158 struct ip_mc_socklist __rcu *mc_list;
146 struct { 159 struct inet_cork cork;
147 unsigned int flags;
148 unsigned int fragsize;
149 struct ip_options *opt;
150 struct dst_entry *dst;
151 int length; /* Total length of all frames */
152 __be32 addr;
153 struct flowi fl;
154 } cork;
155}; 160};
156 161
157#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ 162#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d3a4540cd308..1dd5ecc9a27e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -733,6 +733,7 @@ csum_page(struct page *page, int offset, int copy)
733} 733}
734 734
735static inline int ip_ufo_append_data(struct sock *sk, 735static inline int ip_ufo_append_data(struct sock *sk,
736 struct sk_buff_head *queue,
736 int getfrag(void *from, char *to, int offset, int len, 737 int getfrag(void *from, char *to, int offset, int len,
737 int odd, struct sk_buff *skb), 738 int odd, struct sk_buff *skb),
738 void *from, int length, int hh_len, int fragheaderlen, 739 void *from, int length, int hh_len, int fragheaderlen,
@@ -745,7 +746,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
745 * device, so create one single skb packet containing complete 746 * device, so create one single skb packet containing complete
746 * udp datagram 747 * udp datagram
747 */ 748 */
748 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 749 if ((skb = skb_peek_tail(queue)) == NULL) {
749 skb = sock_alloc_send_skb(sk, 750 skb = sock_alloc_send_skb(sk,
750 hh_len + fragheaderlen + transhdrlen + 20, 751 hh_len + fragheaderlen + transhdrlen + 20,
751 (flags & MSG_DONTWAIT), &err); 752 (flags & MSG_DONTWAIT), &err);
@@ -771,35 +772,24 @@ static inline int ip_ufo_append_data(struct sock *sk,
771 /* specify the length of each IP datagram fragment */ 772 /* specify the length of each IP datagram fragment */
772 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 773 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
773 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 774 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
774 __skb_queue_tail(&sk->sk_write_queue, skb); 775 __skb_queue_tail(queue, skb);
775 } 776 }
776 777
777 return skb_append_datato_frags(sk, skb, getfrag, from, 778 return skb_append_datato_frags(sk, skb, getfrag, from,
778 (length - transhdrlen)); 779 (length - transhdrlen));
779} 780}
780 781
781/* 782static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
782 * ip_append_data() and ip_append_page() can make one large IP datagram 783 struct inet_cork *cork,
783 * from many pieces of data. Each pieces will be holded on the socket 784 int getfrag(void *from, char *to, int offset,
784 * until ip_push_pending_frames() is called. Each piece can be a page 785 int len, int odd, struct sk_buff *skb),
785 * or non-page data. 786 void *from, int length, int transhdrlen,
786 * 787 unsigned int flags)
787 * Not only UDP, other transport protocols - e.g. raw sockets - can use
788 * this interface potentially.
789 *
790 * LATER: length must be adjusted by pad at tail, when it is required.
791 */
792int ip_append_data(struct sock *sk,
793 int getfrag(void *from, char *to, int offset, int len,
794 int odd, struct sk_buff *skb),
795 void *from, int length, int transhdrlen,
796 struct ipcm_cookie *ipc, struct rtable **rtp,
797 unsigned int flags)
798{ 788{
799 struct inet_sock *inet = inet_sk(sk); 789 struct inet_sock *inet = inet_sk(sk);
800 struct sk_buff *skb; 790 struct sk_buff *skb;
801 791
802 struct ip_options *opt = NULL; 792 struct ip_options *opt = inet->cork.opt;
803 int hh_len; 793 int hh_len;
804 int exthdrlen; 794 int exthdrlen;
805 int mtu; 795 int mtu;
@@ -808,58 +798,19 @@ int ip_append_data(struct sock *sk,
808 int offset = 0; 798 int offset = 0;
809 unsigned int maxfraglen, fragheaderlen; 799 unsigned int maxfraglen, fragheaderlen;
810 int csummode = CHECKSUM_NONE; 800 int csummode = CHECKSUM_NONE;
811 struct rtable *rt; 801 struct rtable *rt = (struct rtable *)cork->dst;
812
813 if (flags&MSG_PROBE)
814 return 0;
815 802
816 if (skb_queue_empty(&sk->sk_write_queue)) { 803 exthdrlen = transhdrlen ? rt->dst.header_len : 0;
817 /* 804 length += exthdrlen;
818 * setup for corking. 805 transhdrlen += exthdrlen;
819 */ 806 mtu = inet->cork.fragsize;
820 opt = ipc->opt;
821 if (opt) {
822 if (inet->cork.opt == NULL) {
823 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
824 if (unlikely(inet->cork.opt == NULL))
825 return -ENOBUFS;
826 }
827 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
828 inet->cork.flags |= IPCORK_OPT;
829 inet->cork.addr = ipc->addr;
830 }
831 rt = *rtp;
832 if (unlikely(!rt))
833 return -EFAULT;
834 /*
835 * We steal reference to this route, caller should not release it
836 */
837 *rtp = NULL;
838 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
839 rt->dst.dev->mtu :
840 dst_mtu(rt->dst.path);
841 inet->cork.dst = &rt->dst;
842 inet->cork.length = 0;
843 sk->sk_sndmsg_page = NULL;
844 sk->sk_sndmsg_off = 0;
845 exthdrlen = rt->dst.header_len;
846 length += exthdrlen;
847 transhdrlen += exthdrlen;
848 } else {
849 rt = (struct rtable *)inet->cork.dst;
850 if (inet->cork.flags & IPCORK_OPT)
851 opt = inet->cork.opt;
852 807
853 transhdrlen = 0;
854 exthdrlen = 0;
855 mtu = inet->cork.fragsize;
856 }
857 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 808 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
858 809
859 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 810 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
860 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 811 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
861 812
862 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 813 if (cork->length + length > 0xFFFF - fragheaderlen) {
863 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 814 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
864 mtu-exthdrlen); 815 mtu-exthdrlen);
865 return -EMSGSIZE; 816 return -EMSGSIZE;
@@ -875,15 +826,15 @@ int ip_append_data(struct sock *sk,
875 !exthdrlen) 826 !exthdrlen)
876 csummode = CHECKSUM_PARTIAL; 827 csummode = CHECKSUM_PARTIAL;
877 828
878 skb = skb_peek_tail(&sk->sk_write_queue); 829 skb = skb_peek_tail(queue);
879 830
880 inet->cork.length += length; 831 cork->length += length;
881 if (((length > mtu) || (skb && skb_is_gso(skb))) && 832 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
882 (sk->sk_protocol == IPPROTO_UDP) && 833 (sk->sk_protocol == IPPROTO_UDP) &&
883 (rt->dst.dev->features & NETIF_F_UFO)) { 834 (rt->dst.dev->features & NETIF_F_UFO)) {
884 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 835 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
885 fragheaderlen, transhdrlen, mtu, 836 hh_len, fragheaderlen, transhdrlen,
886 flags); 837 mtu, flags);
887 if (err) 838 if (err)
888 goto error; 839 goto error;
889 return 0; 840 return 0;
@@ -960,7 +911,7 @@ alloc_new_skb:
960 else 911 else
961 /* only the initial fragment is 912 /* only the initial fragment is
962 time stamped */ 913 time stamped */
963 ipc->tx_flags = 0; 914 cork->tx_flags = 0;
964 } 915 }
965 if (skb == NULL) 916 if (skb == NULL)
966 goto error; 917 goto error;
@@ -971,7 +922,7 @@ alloc_new_skb:
971 skb->ip_summed = csummode; 922 skb->ip_summed = csummode;
972 skb->csum = 0; 923 skb->csum = 0;
973 skb_reserve(skb, hh_len); 924 skb_reserve(skb, hh_len);
974 skb_shinfo(skb)->tx_flags = ipc->tx_flags; 925 skb_shinfo(skb)->tx_flags = cork->tx_flags;
975 926
976 /* 927 /*
977 * Find where to start putting bytes. 928 * Find where to start putting bytes.
@@ -1008,7 +959,7 @@ alloc_new_skb:
1008 /* 959 /*
1009 * Put the packet on the pending queue. 960 * Put the packet on the pending queue.
1010 */ 961 */
1011 __skb_queue_tail(&sk->sk_write_queue, skb); 962 __skb_queue_tail(queue, skb);
1012 continue; 963 continue;
1013 } 964 }
1014 965
@@ -1028,8 +979,8 @@ alloc_new_skb:
1028 } else { 979 } else {
1029 int i = skb_shinfo(skb)->nr_frags; 980 int i = skb_shinfo(skb)->nr_frags;
1030 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 981 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1031 struct page *page = sk->sk_sndmsg_page; 982 struct page *page = cork->page;
1032 int off = sk->sk_sndmsg_off; 983 int off = cork->off;
1033 unsigned int left; 984 unsigned int left;
1034 985
1035 if (page && (left = PAGE_SIZE - off) > 0) { 986 if (page && (left = PAGE_SIZE - off) > 0) {
@@ -1041,7 +992,7 @@ alloc_new_skb:
1041 goto error; 992 goto error;
1042 } 993 }
1043 get_page(page); 994 get_page(page);
1044 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 995 skb_fill_page_desc(skb, i, page, off, 0);
1045 frag = &skb_shinfo(skb)->frags[i]; 996 frag = &skb_shinfo(skb)->frags[i];
1046 } 997 }
1047 } else if (i < MAX_SKB_FRAGS) { 998 } else if (i < MAX_SKB_FRAGS) {
@@ -1052,8 +1003,8 @@ alloc_new_skb:
1052 err = -ENOMEM; 1003 err = -ENOMEM;
1053 goto error; 1004 goto error;
1054 } 1005 }
1055 sk->sk_sndmsg_page = page; 1006 cork->page = page;
1056 sk->sk_sndmsg_off = 0; 1007 cork->off = 0;
1057 1008
1058 skb_fill_page_desc(skb, i, page, 0, 0); 1009 skb_fill_page_desc(skb, i, page, 0, 0);
1059 frag = &skb_shinfo(skb)->frags[i]; 1010 frag = &skb_shinfo(skb)->frags[i];
@@ -1065,7 +1016,7 @@ alloc_new_skb:
1065 err = -EFAULT; 1016 err = -EFAULT;
1066 goto error; 1017 goto error;
1067 } 1018 }
1068 sk->sk_sndmsg_off += copy; 1019 cork->off += copy;
1069 frag->size += copy; 1020 frag->size += copy;
1070 skb->len += copy; 1021 skb->len += copy;
1071 skb->data_len += copy; 1022 skb->data_len += copy;
@@ -1079,11 +1030,87 @@ alloc_new_skb:
1079 return 0; 1030 return 0;
1080 1031
1081error: 1032error:
1082 inet->cork.length -= length; 1033 cork->length -= length;
1083 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1034 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1084 return err; 1035 return err;
1085} 1036}
1086 1037
1038static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1039 struct ipcm_cookie *ipc, struct rtable **rtp)
1040{
1041 struct inet_sock *inet = inet_sk(sk);
1042 struct ip_options *opt;
1043 struct rtable *rt;
1044
1045 /*
1046 * setup for corking.
1047 */
1048 opt = ipc->opt;
1049 if (opt) {
1050 if (cork->opt == NULL) {
1051 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1052 sk->sk_allocation);
1053 if (unlikely(cork->opt == NULL))
1054 return -ENOBUFS;
1055 }
1056 memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
1057 cork->flags |= IPCORK_OPT;
1058 cork->addr = ipc->addr;
1059 }
1060 rt = *rtp;
1061 if (unlikely(!rt))
1062 return -EFAULT;
1063 /*
1064 * We steal reference to this route, caller should not release it
1065 */
1066 *rtp = NULL;
1067 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1068 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1069 cork->dst = &rt->dst;
1070 cork->length = 0;
1071 cork->tx_flags = ipc->tx_flags;
1072 cork->page = NULL;
1073 cork->off = 0;
1074
1075 return 0;
1076}
1077
1078/*
1079 * ip_append_data() and ip_append_page() can make one large IP datagram
1080 * from many pieces of data. Each pieces will be holded on the socket
1081 * until ip_push_pending_frames() is called. Each piece can be a page
1082 * or non-page data.
1083 *
1084 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1085 * this interface potentially.
1086 *
1087 * LATER: length must be adjusted by pad at tail, when it is required.
1088 */
1089int ip_append_data(struct sock *sk,
1090 int getfrag(void *from, char *to, int offset, int len,
1091 int odd, struct sk_buff *skb),
1092 void *from, int length, int transhdrlen,
1093 struct ipcm_cookie *ipc, struct rtable **rtp,
1094 unsigned int flags)
1095{
1096 struct inet_sock *inet = inet_sk(sk);
1097 int err;
1098
1099 if (flags&MSG_PROBE)
1100 return 0;
1101
1102 if (skb_queue_empty(&sk->sk_write_queue)) {
1103 err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
1104 if (err)
1105 return err;
1106 } else {
1107 transhdrlen = 0;
1108 }
1109
1110 return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
1111 from, length, transhdrlen, flags);
1112}
1113
1087ssize_t ip_append_page(struct sock *sk, struct page *page, 1114ssize_t ip_append_page(struct sock *sk, struct page *page,
1088 int offset, size_t size, int flags) 1115 int offset, size_t size, int flags)
1089{ 1116{
@@ -1227,40 +1254,42 @@ error:
1227 return err; 1254 return err;
1228} 1255}
1229 1256
1230static void ip_cork_release(struct inet_sock *inet) 1257static void ip_cork_release(struct inet_cork *cork)
1231{ 1258{
1232 inet->cork.flags &= ~IPCORK_OPT; 1259 cork->flags &= ~IPCORK_OPT;
1233 kfree(inet->cork.opt); 1260 kfree(cork->opt);
1234 inet->cork.opt = NULL; 1261 cork->opt = NULL;
1235 dst_release(inet->cork.dst); 1262 dst_release(cork->dst);
1236 inet->cork.dst = NULL; 1263 cork->dst = NULL;
1237} 1264}
1238 1265
1239/* 1266/*
1240 * Combined all pending IP fragments on the socket as one IP datagram 1267 * Combined all pending IP fragments on the socket as one IP datagram
1241 * and push them out. 1268 * and push them out.
1242 */ 1269 */
1243int ip_push_pending_frames(struct sock *sk) 1270static int __ip_push_pending_frames(struct sock *sk,
1271 struct sk_buff_head *queue,
1272 struct inet_cork *cork)
1244{ 1273{
1245 struct sk_buff *skb, *tmp_skb; 1274 struct sk_buff *skb, *tmp_skb;
1246 struct sk_buff **tail_skb; 1275 struct sk_buff **tail_skb;
1247 struct inet_sock *inet = inet_sk(sk); 1276 struct inet_sock *inet = inet_sk(sk);
1248 struct net *net = sock_net(sk); 1277 struct net *net = sock_net(sk);
1249 struct ip_options *opt = NULL; 1278 struct ip_options *opt = NULL;
1250 struct rtable *rt = (struct rtable *)inet->cork.dst; 1279 struct rtable *rt = (struct rtable *)cork->dst;
1251 struct iphdr *iph; 1280 struct iphdr *iph;
1252 __be16 df = 0; 1281 __be16 df = 0;
1253 __u8 ttl; 1282 __u8 ttl;
1254 int err = 0; 1283 int err = 0;
1255 1284
1256 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1285 if ((skb = __skb_dequeue(queue)) == NULL)
1257 goto out; 1286 goto out;
1258 tail_skb = &(skb_shinfo(skb)->frag_list); 1287 tail_skb = &(skb_shinfo(skb)->frag_list);
1259 1288
1260 /* move skb->data to ip header from ext header */ 1289 /* move skb->data to ip header from ext header */
1261 if (skb->data < skb_network_header(skb)) 1290 if (skb->data < skb_network_header(skb))
1262 __skb_pull(skb, skb_network_offset(skb)); 1291 __skb_pull(skb, skb_network_offset(skb));
1263 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1292 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1264 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1293 __skb_pull(tmp_skb, skb_network_header_len(skb));
1265 *tail_skb = tmp_skb; 1294 *tail_skb = tmp_skb;
1266 tail_skb = &(tmp_skb->next); 1295 tail_skb = &(tmp_skb->next);
@@ -1286,8 +1315,8 @@ int ip_push_pending_frames(struct sock *sk)
1286 ip_dont_fragment(sk, &rt->dst))) 1315 ip_dont_fragment(sk, &rt->dst)))
1287 df = htons(IP_DF); 1316 df = htons(IP_DF);
1288 1317
1289 if (inet->cork.flags & IPCORK_OPT) 1318 if (cork->flags & IPCORK_OPT)
1290 opt = inet->cork.opt; 1319 opt = cork->opt;
1291 1320
1292 if (rt->rt_type == RTN_MULTICAST) 1321 if (rt->rt_type == RTN_MULTICAST)
1293 ttl = inet->mc_ttl; 1322 ttl = inet->mc_ttl;
@@ -1299,7 +1328,7 @@ int ip_push_pending_frames(struct sock *sk)
1299 iph->ihl = 5; 1328 iph->ihl = 5;
1300 if (opt) { 1329 if (opt) {
1301 iph->ihl += opt->optlen>>2; 1330 iph->ihl += opt->optlen>>2;
1302 ip_options_build(skb, opt, inet->cork.addr, rt, 0); 1331 ip_options_build(skb, opt, cork->addr, rt, 0);
1303 } 1332 }
1304 iph->tos = inet->tos; 1333 iph->tos = inet->tos;
1305 iph->frag_off = df; 1334 iph->frag_off = df;
@@ -1315,7 +1344,7 @@ int ip_push_pending_frames(struct sock *sk)
1315 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1344 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1316 * on dst refcount 1345 * on dst refcount
1317 */ 1346 */
1318 inet->cork.dst = NULL; 1347 cork->dst = NULL;
1319 skb_dst_set(skb, &rt->dst); 1348 skb_dst_set(skb, &rt->dst);
1320 1349
1321 if (iph->protocol == IPPROTO_ICMP) 1350 if (iph->protocol == IPPROTO_ICMP)
@@ -1332,7 +1361,7 @@ int ip_push_pending_frames(struct sock *sk)
1332 } 1361 }
1333 1362
1334out: 1363out:
1335 ip_cork_release(inet); 1364 ip_cork_release(cork);
1336 return err; 1365 return err;
1337 1366
1338error: 1367error:
@@ -1340,17 +1369,30 @@ error:
1340 goto out; 1369 goto out;
1341} 1370}
1342 1371
1372int ip_push_pending_frames(struct sock *sk)
1373{
1374 return __ip_push_pending_frames(sk, &sk->sk_write_queue,
1375 &inet_sk(sk)->cork);
1376}
1377
1343/* 1378/*
1344 * Throw away all pending data on the socket. 1379 * Throw away all pending data on the socket.
1345 */ 1380 */
1346void ip_flush_pending_frames(struct sock *sk) 1381static void __ip_flush_pending_frames(struct sock *sk,
1382 struct sk_buff_head *queue,
1383 struct inet_cork *cork)
1347{ 1384{
1348 struct sk_buff *skb; 1385 struct sk_buff *skb;
1349 1386
1350 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) 1387 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1351 kfree_skb(skb); 1388 kfree_skb(skb);
1352 1389
1353 ip_cork_release(inet_sk(sk)); 1390 ip_cork_release(cork);
1391}
1392
1393void ip_flush_pending_frames(struct sock *sk)
1394{
1395 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
1354} 1396}
1355 1397
1356 1398