aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/ip_output.c
diff options
context:
space:
mode:
author Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
commit c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /net/ipv4/ip_output.c
parent ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent 6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp (wip-k-fmlp)
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'net/ipv4/ip_output.c')
-rw-r--r-- net/ipv4/ip_output.c 474
1 files changed, 274 insertions, 200 deletions
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7649d7750075..84f26e8e6c60 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -82,6 +82,7 @@
82#include <linux/tcp.h> 82#include <linux/tcp.h>
83 83
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; 84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
85 86
86/* Generate a checksum for an outgoing IP datagram. */ 87/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph) 88__inline__ void ip_send_check(struct iphdr *iph)
@@ -130,7 +131,7 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 int ttl = inet->uc_ttl; 131 int ttl = inet->uc_ttl;
131 132
132 if (ttl < 0) 133 if (ttl < 0)
133 ttl = dst_metric(dst, RTAX_HOPLIMIT); 134 ttl = ip4_dst_hoplimit(dst);
134 return ttl; 135 return ttl;
135} 136}
136 137
@@ -139,14 +140,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
139 * 140 *
140 */ 141 */
141int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, 142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
142 __be32 saddr, __be32 daddr, struct ip_options *opt) 143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
143{ 144{
144 struct inet_sock *inet = inet_sk(sk); 145 struct inet_sock *inet = inet_sk(sk);
145 struct rtable *rt = skb_rtable(skb); 146 struct rtable *rt = skb_rtable(skb);
146 struct iphdr *iph; 147 struct iphdr *iph;
147 148
148 /* Build the IP header. */ 149 /* Build the IP header. */
149 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); 150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
150 skb_reset_network_header(skb); 151 skb_reset_network_header(skb);
151 iph = ip_hdr(skb); 152 iph = ip_hdr(skb);
152 iph->version = 4; 153 iph->version = 4;
@@ -157,14 +158,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
157 else 158 else
158 iph->frag_off = 0; 159 iph->frag_off = 0;
159 iph->ttl = ip_select_ttl(inet, &rt->dst); 160 iph->ttl = ip_select_ttl(inet, &rt->dst);
160 iph->daddr = rt->rt_dst; 161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
161 iph->saddr = rt->rt_src; 162 iph->saddr = saddr;
162 iph->protocol = sk->sk_protocol; 163 iph->protocol = sk->sk_protocol;
163 ip_select_ident(iph, &rt->dst, sk); 164 ip_select_ident(iph, &rt->dst, sk);
164 165
165 if (opt && opt->optlen) { 166 if (opt && opt->opt.optlen) {
166 iph->ihl += opt->optlen>>2; 167 iph->ihl += opt->opt.optlen>>2;
167 ip_options_build(skb, opt, daddr, rt, 0); 168 ip_options_build(skb, &opt->opt, daddr, rt, 0);
168 } 169 }
169 170
170 skb->priority = sk->sk_priority; 171 skb->priority = sk->sk_priority;
@@ -311,11 +312,12 @@ int ip_output(struct sk_buff *skb)
311 !(IPCB(skb)->flags & IPSKB_REROUTED)); 312 !(IPCB(skb)->flags & IPSKB_REROUTED));
312} 313}
313 314
314int ip_queue_xmit(struct sk_buff *skb) 315int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
315{ 316{
316 struct sock *sk = skb->sk; 317 struct sock *sk = skb->sk;
317 struct inet_sock *inet = inet_sk(sk); 318 struct inet_sock *inet = inet_sk(sk);
318 struct ip_options *opt = inet->opt; 319 struct ip_options_rcu *inet_opt;
320 struct flowi4 *fl4;
319 struct rtable *rt; 321 struct rtable *rt;
320 struct iphdr *iph; 322 struct iphdr *iph;
321 int res; 323 int res;
@@ -324,6 +326,8 @@ int ip_queue_xmit(struct sk_buff *skb)
324 * f.e. by something like SCTP. 326 * f.e. by something like SCTP.
325 */ 327 */
326 rcu_read_lock(); 328 rcu_read_lock();
329 inet_opt = rcu_dereference(inet->inet_opt);
330 fl4 = &fl->u.ip4;
327 rt = skb_rtable(skb); 331 rt = skb_rtable(skb);
328 if (rt != NULL) 332 if (rt != NULL)
329 goto packet_routed; 333 goto packet_routed;
@@ -335,40 +339,32 @@ int ip_queue_xmit(struct sk_buff *skb)
335 339
336 /* Use correct destination address if we have options. */ 340 /* Use correct destination address if we have options. */
337 daddr = inet->inet_daddr; 341 daddr = inet->inet_daddr;
338 if(opt && opt->srr) 342 if (inet_opt && inet_opt->opt.srr)
339 daddr = opt->faddr; 343 daddr = inet_opt->opt.faddr;
340 344
341 { 345 /* If this fails, retransmit mechanism of transport layer will
342 struct flowi fl = { .oif = sk->sk_bound_dev_if, 346 * keep trying until route appears or the connection times
343 .mark = sk->sk_mark, 347 * itself out.
344 .nl_u = { .ip4_u = 348 */
345 { .daddr = daddr, 349 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
346 .saddr = inet->inet_saddr, 350 daddr, inet->inet_saddr,
347 .tos = RT_CONN_FLAGS(sk) } }, 351 inet->inet_dport,
348 .proto = sk->sk_protocol, 352 inet->inet_sport,
349 .flags = inet_sk_flowi_flags(sk), 353 sk->sk_protocol,
350 .uli_u = { .ports = 354 RT_CONN_FLAGS(sk),
351 { .sport = inet->inet_sport, 355 sk->sk_bound_dev_if);
352 .dport = inet->inet_dport } } }; 356 if (IS_ERR(rt))
353 357 goto no_route;
354 /* If this fails, retransmit mechanism of transport layer will
355 * keep trying until route appears or the connection times
356 * itself out.
357 */
358 security_sk_classify_flow(sk, &fl);
359 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
360 goto no_route;
361 }
362 sk_setup_caps(sk, &rt->dst); 358 sk_setup_caps(sk, &rt->dst);
363 } 359 }
364 skb_dst_set_noref(skb, &rt->dst); 360 skb_dst_set_noref(skb, &rt->dst);
365 361
366packet_routed: 362packet_routed:
367 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 363 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
368 goto no_route; 364 goto no_route;
369 365
370 /* OK, we know where to send it, allocate and build IP header. */ 366 /* OK, we know where to send it, allocate and build IP header. */
371 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); 367 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
372 skb_reset_network_header(skb); 368 skb_reset_network_header(skb);
373 iph = ip_hdr(skb); 369 iph = ip_hdr(skb);
374 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 370 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
@@ -378,13 +374,13 @@ packet_routed:
378 iph->frag_off = 0; 374 iph->frag_off = 0;
379 iph->ttl = ip_select_ttl(inet, &rt->dst); 375 iph->ttl = ip_select_ttl(inet, &rt->dst);
380 iph->protocol = sk->sk_protocol; 376 iph->protocol = sk->sk_protocol;
381 iph->saddr = rt->rt_src; 377 iph->saddr = fl4->saddr;
382 iph->daddr = rt->rt_dst; 378 iph->daddr = fl4->daddr;
383 /* Transport layer set skb->h.foo itself. */ 379 /* Transport layer set skb->h.foo itself. */
384 380
385 if (opt && opt->optlen) { 381 if (inet_opt && inet_opt->opt.optlen) {
386 iph->ihl += opt->optlen >> 2; 382 iph->ihl += inet_opt->opt.optlen >> 2;
387 ip_options_build(skb, opt, inet->inet_daddr, rt, 0); 383 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
388 } 384 }
389 385
390 ip_select_ident_more(iph, &rt->dst, sk, 386 ip_select_ident_more(iph, &rt->dst, sk,
@@ -487,7 +483,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
487 * LATER: this step can be merged to real generation of fragments, 483 * LATER: this step can be merged to real generation of fragments,
488 * we can switch to copy when see the first bad fragment. 484 * we can switch to copy when see the first bad fragment.
489 */ 485 */
490 if (skb_has_frags(skb)) { 486 if (skb_has_frag_list(skb)) {
491 struct sk_buff *frag, *frag2; 487 struct sk_buff *frag, *frag2;
492 int first_len = skb_pagelen(skb); 488 int first_len = skb_pagelen(skb);
493 489
@@ -610,7 +606,7 @@ slow_path:
610 /* IF: it doesn't fit, use 'mtu' - the data space left */ 606 /* IF: it doesn't fit, use 'mtu' - the data space left */
611 if (len > mtu) 607 if (len > mtu)
612 len = mtu; 608 len = mtu;
613 /* IF: we are not sending upto and including the packet end 609 /* IF: we are not sending up to and including the packet end
614 then align the next start on an eight byte boundary */ 610 then align the next start on an eight byte boundary */
615 if (len < left) { 611 if (len < left) {
616 len &= ~7; 612 len &= ~7;
@@ -734,6 +730,7 @@ csum_page(struct page *page, int offset, int copy)
734} 730}
735 731
736static inline int ip_ufo_append_data(struct sock *sk, 732static inline int ip_ufo_append_data(struct sock *sk,
733 struct sk_buff_head *queue,
737 int getfrag(void *from, char *to, int offset, int len, 734 int getfrag(void *from, char *to, int offset, int len,
738 int odd, struct sk_buff *skb), 735 int odd, struct sk_buff *skb),
739 void *from, int length, int hh_len, int fragheaderlen, 736 void *from, int length, int hh_len, int fragheaderlen,
@@ -746,7 +743,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
746 * device, so create one single skb packet containing complete 743 * device, so create one single skb packet containing complete
747 * udp datagram 744 * udp datagram
748 */ 745 */
749 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 746 if ((skb = skb_peek_tail(queue)) == NULL) {
750 skb = sock_alloc_send_skb(sk, 747 skb = sock_alloc_send_skb(sk,
751 hh_len + fragheaderlen + transhdrlen + 20, 748 hh_len + fragheaderlen + transhdrlen + 20,
752 (flags & MSG_DONTWAIT), &err); 749 (flags & MSG_DONTWAIT), &err);
@@ -768,40 +765,30 @@ static inline int ip_ufo_append_data(struct sock *sk,
768 765
769 skb->ip_summed = CHECKSUM_PARTIAL; 766 skb->ip_summed = CHECKSUM_PARTIAL;
770 skb->csum = 0; 767 skb->csum = 0;
771 sk->sk_sndmsg_off = 0;
772 768
773 /* specify the length of each IP datagram fragment */ 769 /* specify the length of each IP datagram fragment */
774 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 770 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
775 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 771 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
776 __skb_queue_tail(&sk->sk_write_queue, skb); 772 __skb_queue_tail(queue, skb);
777 } 773 }
778 774
779 return skb_append_datato_frags(sk, skb, getfrag, from, 775 return skb_append_datato_frags(sk, skb, getfrag, from,
780 (length - transhdrlen)); 776 (length - transhdrlen));
781} 777}
782 778
783/* 779static int __ip_append_data(struct sock *sk,
784 * ip_append_data() and ip_append_page() can make one large IP datagram 780 struct flowi4 *fl4,
785 * from many pieces of data. Each pieces will be holded on the socket 781 struct sk_buff_head *queue,
786 * until ip_push_pending_frames() is called. Each piece can be a page 782 struct inet_cork *cork,
787 * or non-page data. 783 int getfrag(void *from, char *to, int offset,
788 * 784 int len, int odd, struct sk_buff *skb),
789 * Not only UDP, other transport protocols - e.g. raw sockets - can use 785 void *from, int length, int transhdrlen,
790 * this interface potentially. 786 unsigned int flags)
791 *
792 * LATER: length must be adjusted by pad at tail, when it is required.
793 */
794int ip_append_data(struct sock *sk,
795 int getfrag(void *from, char *to, int offset, int len,
796 int odd, struct sk_buff *skb),
797 void *from, int length, int transhdrlen,
798 struct ipcm_cookie *ipc, struct rtable **rtp,
799 unsigned int flags)
800{ 787{
801 struct inet_sock *inet = inet_sk(sk); 788 struct inet_sock *inet = inet_sk(sk);
802 struct sk_buff *skb; 789 struct sk_buff *skb;
803 790
804 struct ip_options *opt = NULL; 791 struct ip_options *opt = cork->opt;
805 int hh_len; 792 int hh_len;
806 int exthdrlen; 793 int exthdrlen;
807 int mtu; 794 int mtu;
@@ -810,60 +797,20 @@ int ip_append_data(struct sock *sk,
810 int offset = 0; 797 int offset = 0;
811 unsigned int maxfraglen, fragheaderlen; 798 unsigned int maxfraglen, fragheaderlen;
812 int csummode = CHECKSUM_NONE; 799 int csummode = CHECKSUM_NONE;
813 struct rtable *rt; 800 struct rtable *rt = (struct rtable *)cork->dst;
814 801
815 if (flags&MSG_PROBE) 802 skb = skb_peek_tail(queue);
816 return 0;
817 803
818 if (skb_queue_empty(&sk->sk_write_queue)) { 804 exthdrlen = !skb ? rt->dst.header_len : 0;
819 /* 805 mtu = cork->fragsize;
820 * setup for corking.
821 */
822 opt = ipc->opt;
823 if (opt) {
824 if (inet->cork.opt == NULL) {
825 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
826 if (unlikely(inet->cork.opt == NULL))
827 return -ENOBUFS;
828 }
829 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
830 inet->cork.flags |= IPCORK_OPT;
831 inet->cork.addr = ipc->addr;
832 }
833 rt = *rtp;
834 if (unlikely(!rt))
835 return -EFAULT;
836 /*
837 * We steal reference to this route, caller should not release it
838 */
839 *rtp = NULL;
840 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
841 rt->dst.dev->mtu :
842 dst_mtu(rt->dst.path);
843 inet->cork.dst = &rt->dst;
844 inet->cork.length = 0;
845 sk->sk_sndmsg_page = NULL;
846 sk->sk_sndmsg_off = 0;
847 if ((exthdrlen = rt->dst.header_len) != 0) {
848 length += exthdrlen;
849 transhdrlen += exthdrlen;
850 }
851 } else {
852 rt = (struct rtable *)inet->cork.dst;
853 if (inet->cork.flags & IPCORK_OPT)
854 opt = inet->cork.opt;
855 806
856 transhdrlen = 0;
857 exthdrlen = 0;
858 mtu = inet->cork.fragsize;
859 }
860 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 807 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
861 808
862 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 809 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
863 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 810 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
864 811
865 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 812 if (cork->length + length > 0xFFFF - fragheaderlen) {
866 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 813 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
867 mtu-exthdrlen); 814 mtu-exthdrlen);
868 return -EMSGSIZE; 815 return -EMSGSIZE;
869 } 816 }
@@ -878,15 +825,13 @@ int ip_append_data(struct sock *sk,
878 !exthdrlen) 825 !exthdrlen)
879 csummode = CHECKSUM_PARTIAL; 826 csummode = CHECKSUM_PARTIAL;
880 827
881 skb = skb_peek_tail(&sk->sk_write_queue); 828 cork->length += length;
882
883 inet->cork.length += length;
884 if (((length > mtu) || (skb && skb_is_gso(skb))) && 829 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
885 (sk->sk_protocol == IPPROTO_UDP) && 830 (sk->sk_protocol == IPPROTO_UDP) &&
886 (rt->dst.dev->features & NETIF_F_UFO)) { 831 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
887 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 832 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
888 fragheaderlen, transhdrlen, mtu, 833 hh_len, fragheaderlen, transhdrlen,
889 flags); 834 mtu, flags);
890 if (err) 835 if (err)
891 goto error; 836 goto error;
892 return 0; 837 return 0;
@@ -934,7 +879,9 @@ alloc_new_skb:
934 !(rt->dst.dev->features&NETIF_F_SG)) 879 !(rt->dst.dev->features&NETIF_F_SG))
935 alloclen = mtu; 880 alloclen = mtu;
936 else 881 else
937 alloclen = datalen + fragheaderlen; 882 alloclen = fraglen;
883
884 alloclen += exthdrlen;
938 885
939 /* The last fragment gets additional space at tail. 886 /* The last fragment gets additional space at tail.
940 * Note, with MSG_MORE we overallocate on fragments, 887 * Note, with MSG_MORE we overallocate on fragments,
@@ -960,7 +907,7 @@ alloc_new_skb:
960 else 907 else
961 /* only the initial fragment is 908 /* only the initial fragment is
962 time stamped */ 909 time stamped */
963 ipc->shtx.flags = 0; 910 cork->tx_flags = 0;
964 } 911 }
965 if (skb == NULL) 912 if (skb == NULL)
966 goto error; 913 goto error;
@@ -971,16 +918,16 @@ alloc_new_skb:
971 skb->ip_summed = csummode; 918 skb->ip_summed = csummode;
972 skb->csum = 0; 919 skb->csum = 0;
973 skb_reserve(skb, hh_len); 920 skb_reserve(skb, hh_len);
974 *skb_tx(skb) = ipc->shtx; 921 skb_shinfo(skb)->tx_flags = cork->tx_flags;
975 922
976 /* 923 /*
977 * Find where to start putting bytes. 924 * Find where to start putting bytes.
978 */ 925 */
979 data = skb_put(skb, fraglen); 926 data = skb_put(skb, fraglen + exthdrlen);
980 skb_set_network_header(skb, exthdrlen); 927 skb_set_network_header(skb, exthdrlen);
981 skb->transport_header = (skb->network_header + 928 skb->transport_header = (skb->network_header +
982 fragheaderlen); 929 fragheaderlen);
983 data += fragheaderlen; 930 data += fragheaderlen + exthdrlen;
984 931
985 if (fraggap) { 932 if (fraggap) {
986 skb->csum = skb_copy_and_csum_bits( 933 skb->csum = skb_copy_and_csum_bits(
@@ -1008,7 +955,7 @@ alloc_new_skb:
1008 /* 955 /*
1009 * Put the packet on the pending queue. 956 * Put the packet on the pending queue.
1010 */ 957 */
1011 __skb_queue_tail(&sk->sk_write_queue, skb); 958 __skb_queue_tail(queue, skb);
1012 continue; 959 continue;
1013 } 960 }
1014 961
@@ -1028,8 +975,8 @@ alloc_new_skb:
1028 } else { 975 } else {
1029 int i = skb_shinfo(skb)->nr_frags; 976 int i = skb_shinfo(skb)->nr_frags;
1030 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 977 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1031 struct page *page = sk->sk_sndmsg_page; 978 struct page *page = cork->page;
1032 int off = sk->sk_sndmsg_off; 979 int off = cork->off;
1033 unsigned int left; 980 unsigned int left;
1034 981
1035 if (page && (left = PAGE_SIZE - off) > 0) { 982 if (page && (left = PAGE_SIZE - off) > 0) {
@@ -1041,7 +988,7 @@ alloc_new_skb:
1041 goto error; 988 goto error;
1042 } 989 }
1043 get_page(page); 990 get_page(page);
1044 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 991 skb_fill_page_desc(skb, i, page, off, 0);
1045 frag = &skb_shinfo(skb)->frags[i]; 992 frag = &skb_shinfo(skb)->frags[i];
1046 } 993 }
1047 } else if (i < MAX_SKB_FRAGS) { 994 } else if (i < MAX_SKB_FRAGS) {
@@ -1052,8 +999,8 @@ alloc_new_skb:
1052 err = -ENOMEM; 999 err = -ENOMEM;
1053 goto error; 1000 goto error;
1054 } 1001 }
1055 sk->sk_sndmsg_page = page; 1002 cork->page = page;
1056 sk->sk_sndmsg_off = 0; 1003 cork->off = 0;
1057 1004
1058 skb_fill_page_desc(skb, i, page, 0, 0); 1005 skb_fill_page_desc(skb, i, page, 0, 0);
1059 frag = &skb_shinfo(skb)->frags[i]; 1006 frag = &skb_shinfo(skb)->frags[i];
@@ -1065,7 +1012,7 @@ alloc_new_skb:
1065 err = -EFAULT; 1012 err = -EFAULT;
1066 goto error; 1013 goto error;
1067 } 1014 }
1068 sk->sk_sndmsg_off += copy; 1015 cork->off += copy;
1069 frag->size += copy; 1016 frag->size += copy;
1070 skb->len += copy; 1017 skb->len += copy;
1071 skb->data_len += copy; 1018 skb->data_len += copy;
@@ -1079,18 +1026,95 @@ alloc_new_skb:
1079 return 0; 1026 return 0;
1080 1027
1081error: 1028error:
1082 inet->cork.length -= length; 1029 cork->length -= length;
1083 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1030 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1084 return err; 1031 return err;
1085} 1032}
1086 1033
1087ssize_t ip_append_page(struct sock *sk, struct page *page, 1034static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1035 struct ipcm_cookie *ipc, struct rtable **rtp)
1036{
1037 struct inet_sock *inet = inet_sk(sk);
1038 struct ip_options_rcu *opt;
1039 struct rtable *rt;
1040
1041 /*
1042 * setup for corking.
1043 */
1044 opt = ipc->opt;
1045 if (opt) {
1046 if (cork->opt == NULL) {
1047 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1048 sk->sk_allocation);
1049 if (unlikely(cork->opt == NULL))
1050 return -ENOBUFS;
1051 }
1052 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1053 cork->flags |= IPCORK_OPT;
1054 cork->addr = ipc->addr;
1055 }
1056 rt = *rtp;
1057 if (unlikely(!rt))
1058 return -EFAULT;
1059 /*
1060 * We steal reference to this route, caller should not release it
1061 */
1062 *rtp = NULL;
1063 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1064 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1065 cork->dst = &rt->dst;
1066 cork->length = 0;
1067 cork->tx_flags = ipc->tx_flags;
1068 cork->page = NULL;
1069 cork->off = 0;
1070
1071 return 0;
1072}
1073
1074/*
1075 * ip_append_data() and ip_append_page() can make one large IP datagram
1076 * from many pieces of data. Each pieces will be holded on the socket
1077 * until ip_push_pending_frames() is called. Each piece can be a page
1078 * or non-page data.
1079 *
1080 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1081 * this interface potentially.
1082 *
1083 * LATER: length must be adjusted by pad at tail, when it is required.
1084 */
1085int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1086 int getfrag(void *from, char *to, int offset, int len,
1087 int odd, struct sk_buff *skb),
1088 void *from, int length, int transhdrlen,
1089 struct ipcm_cookie *ipc, struct rtable **rtp,
1090 unsigned int flags)
1091{
1092 struct inet_sock *inet = inet_sk(sk);
1093 int err;
1094
1095 if (flags&MSG_PROBE)
1096 return 0;
1097
1098 if (skb_queue_empty(&sk->sk_write_queue)) {
1099 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1100 if (err)
1101 return err;
1102 } else {
1103 transhdrlen = 0;
1104 }
1105
1106 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1107 from, length, transhdrlen, flags);
1108}
1109
1110ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1088 int offset, size_t size, int flags) 1111 int offset, size_t size, int flags)
1089{ 1112{
1090 struct inet_sock *inet = inet_sk(sk); 1113 struct inet_sock *inet = inet_sk(sk);
1091 struct sk_buff *skb; 1114 struct sk_buff *skb;
1092 struct rtable *rt; 1115 struct rtable *rt;
1093 struct ip_options *opt = NULL; 1116 struct ip_options *opt = NULL;
1117 struct inet_cork *cork;
1094 int hh_len; 1118 int hh_len;
1095 int mtu; 1119 int mtu;
1096 int len; 1120 int len;
@@ -1106,28 +1130,29 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1106 if (skb_queue_empty(&sk->sk_write_queue)) 1130 if (skb_queue_empty(&sk->sk_write_queue))
1107 return -EINVAL; 1131 return -EINVAL;
1108 1132
1109 rt = (struct rtable *)inet->cork.dst; 1133 cork = &inet->cork.base;
1110 if (inet->cork.flags & IPCORK_OPT) 1134 rt = (struct rtable *)cork->dst;
1111 opt = inet->cork.opt; 1135 if (cork->flags & IPCORK_OPT)
1136 opt = cork->opt;
1112 1137
1113 if (!(rt->dst.dev->features&NETIF_F_SG)) 1138 if (!(rt->dst.dev->features&NETIF_F_SG))
1114 return -EOPNOTSUPP; 1139 return -EOPNOTSUPP;
1115 1140
1116 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1141 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1117 mtu = inet->cork.fragsize; 1142 mtu = cork->fragsize;
1118 1143
1119 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1144 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1120 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1145 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1121 1146
1122 if (inet->cork.length + size > 0xFFFF - fragheaderlen) { 1147 if (cork->length + size > 0xFFFF - fragheaderlen) {
1123 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); 1148 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1124 return -EMSGSIZE; 1149 return -EMSGSIZE;
1125 } 1150 }
1126 1151
1127 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 1152 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1128 return -EINVAL; 1153 return -EINVAL;
1129 1154
1130 inet->cork.length += size; 1155 cork->length += size;
1131 if ((size + skb->len > mtu) && 1156 if ((size + skb->len > mtu) &&
1132 (sk->sk_protocol == IPPROTO_UDP) && 1157 (sk->sk_protocol == IPPROTO_UDP) &&
1133 (rt->dst.dev->features & NETIF_F_UFO)) { 1158 (rt->dst.dev->features & NETIF_F_UFO)) {
@@ -1222,45 +1247,47 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1222 return 0; 1247 return 0;
1223 1248
1224error: 1249error:
1225 inet->cork.length -= size; 1250 cork->length -= size;
1226 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1251 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1227 return err; 1252 return err;
1228} 1253}
1229 1254
1230static void ip_cork_release(struct inet_sock *inet) 1255static void ip_cork_release(struct inet_cork *cork)
1231{ 1256{
1232 inet->cork.flags &= ~IPCORK_OPT; 1257 cork->flags &= ~IPCORK_OPT;
1233 kfree(inet->cork.opt); 1258 kfree(cork->opt);
1234 inet->cork.opt = NULL; 1259 cork->opt = NULL;
1235 dst_release(inet->cork.dst); 1260 dst_release(cork->dst);
1236 inet->cork.dst = NULL; 1261 cork->dst = NULL;
1237} 1262}
1238 1263
1239/* 1264/*
1240 * Combined all pending IP fragments on the socket as one IP datagram 1265 * Combined all pending IP fragments on the socket as one IP datagram
1241 * and push them out. 1266 * and push them out.
1242 */ 1267 */
1243int ip_push_pending_frames(struct sock *sk) 1268struct sk_buff *__ip_make_skb(struct sock *sk,
1269 struct flowi4 *fl4,
1270 struct sk_buff_head *queue,
1271 struct inet_cork *cork)
1244{ 1272{
1245 struct sk_buff *skb, *tmp_skb; 1273 struct sk_buff *skb, *tmp_skb;
1246 struct sk_buff **tail_skb; 1274 struct sk_buff **tail_skb;
1247 struct inet_sock *inet = inet_sk(sk); 1275 struct inet_sock *inet = inet_sk(sk);
1248 struct net *net = sock_net(sk); 1276 struct net *net = sock_net(sk);
1249 struct ip_options *opt = NULL; 1277 struct ip_options *opt = NULL;
1250 struct rtable *rt = (struct rtable *)inet->cork.dst; 1278 struct rtable *rt = (struct rtable *)cork->dst;
1251 struct iphdr *iph; 1279 struct iphdr *iph;
1252 __be16 df = 0; 1280 __be16 df = 0;
1253 __u8 ttl; 1281 __u8 ttl;
1254 int err = 0;
1255 1282
1256 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1283 if ((skb = __skb_dequeue(queue)) == NULL)
1257 goto out; 1284 goto out;
1258 tail_skb = &(skb_shinfo(skb)->frag_list); 1285 tail_skb = &(skb_shinfo(skb)->frag_list);
1259 1286
1260 /* move skb->data to ip header from ext header */ 1287 /* move skb->data to ip header from ext header */
1261 if (skb->data < skb_network_header(skb)) 1288 if (skb->data < skb_network_header(skb))
1262 __skb_pull(skb, skb_network_offset(skb)); 1289 __skb_pull(skb, skb_network_offset(skb));
1263 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1290 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1264 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1291 __skb_pull(tmp_skb, skb_network_header_len(skb));
1265 *tail_skb = tmp_skb; 1292 *tail_skb = tmp_skb;
1266 tail_skb = &(tmp_skb->next); 1293 tail_skb = &(tmp_skb->next);
@@ -1286,8 +1313,8 @@ int ip_push_pending_frames(struct sock *sk)
1286 ip_dont_fragment(sk, &rt->dst))) 1313 ip_dont_fragment(sk, &rt->dst)))
1287 df = htons(IP_DF); 1314 df = htons(IP_DF);
1288 1315
1289 if (inet->cork.flags & IPCORK_OPT) 1316 if (cork->flags & IPCORK_OPT)
1290 opt = inet->cork.opt; 1317 opt = cork->opt;
1291 1318
1292 if (rt->rt_type == RTN_MULTICAST) 1319 if (rt->rt_type == RTN_MULTICAST)
1293 ttl = inet->mc_ttl; 1320 ttl = inet->mc_ttl;
@@ -1297,17 +1324,18 @@ int ip_push_pending_frames(struct sock *sk)
1297 iph = (struct iphdr *)skb->data; 1324 iph = (struct iphdr *)skb->data;
1298 iph->version = 4; 1325 iph->version = 4;
1299 iph->ihl = 5; 1326 iph->ihl = 5;
1300 if (opt) {
1301 iph->ihl += opt->optlen>>2;
1302 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1303 }
1304 iph->tos = inet->tos; 1327 iph->tos = inet->tos;
1305 iph->frag_off = df; 1328 iph->frag_off = df;
1306 ip_select_ident(iph, &rt->dst, sk); 1329 ip_select_ident(iph, &rt->dst, sk);
1307 iph->ttl = ttl; 1330 iph->ttl = ttl;
1308 iph->protocol = sk->sk_protocol; 1331 iph->protocol = sk->sk_protocol;
1309 iph->saddr = rt->rt_src; 1332 iph->saddr = fl4->saddr;
1310 iph->daddr = rt->rt_dst; 1333 iph->daddr = fl4->daddr;
1334
1335 if (opt) {
1336 iph->ihl += opt->optlen>>2;
1337 ip_options_build(skb, opt, cork->addr, rt, 0);
1338 }
1311 1339
1312 skb->priority = sk->sk_priority; 1340 skb->priority = sk->sk_priority;
1313 skb->mark = sk->sk_mark; 1341 skb->mark = sk->sk_mark;
@@ -1315,44 +1343,99 @@ int ip_push_pending_frames(struct sock *sk)
1315 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1343 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1316 * on dst refcount 1344 * on dst refcount
1317 */ 1345 */
1318 inet->cork.dst = NULL; 1346 cork->dst = NULL;
1319 skb_dst_set(skb, &rt->dst); 1347 skb_dst_set(skb, &rt->dst);
1320 1348
1321 if (iph->protocol == IPPROTO_ICMP) 1349 if (iph->protocol == IPPROTO_ICMP)
1322 icmp_out_count(net, ((struct icmphdr *) 1350 icmp_out_count(net, ((struct icmphdr *)
1323 skb_transport_header(skb))->type); 1351 skb_transport_header(skb))->type);
1324 1352
1325 /* Netfilter gets whole the not fragmented skb. */ 1353 ip_cork_release(cork);
1354out:
1355 return skb;
1356}
1357
1358int ip_send_skb(struct sk_buff *skb)
1359{
1360 struct net *net = sock_net(skb->sk);
1361 int err;
1362
1326 err = ip_local_out(skb); 1363 err = ip_local_out(skb);
1327 if (err) { 1364 if (err) {
1328 if (err > 0) 1365 if (err > 0)
1329 err = net_xmit_errno(err); 1366 err = net_xmit_errno(err);
1330 if (err) 1367 if (err)
1331 goto error; 1368 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1332 } 1369 }
1333 1370
1334out:
1335 ip_cork_release(inet);
1336 return err; 1371 return err;
1372}
1337 1373
1338error: 1374int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1339 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 1375{
1340 goto out; 1376 struct sk_buff *skb;
1377
1378 skb = ip_finish_skb(sk, fl4);
1379 if (!skb)
1380 return 0;
1381
1382 /* Netfilter gets whole the not fragmented skb. */
1383 return ip_send_skb(skb);
1341} 1384}
1342 1385
1343/* 1386/*
1344 * Throw away all pending data on the socket. 1387 * Throw away all pending data on the socket.
1345 */ 1388 */
1346void ip_flush_pending_frames(struct sock *sk) 1389static void __ip_flush_pending_frames(struct sock *sk,
1390 struct sk_buff_head *queue,
1391 struct inet_cork *cork)
1347{ 1392{
1348 struct sk_buff *skb; 1393 struct sk_buff *skb;
1349 1394
1350 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) 1395 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1351 kfree_skb(skb); 1396 kfree_skb(skb);
1352 1397
1353 ip_cork_release(inet_sk(sk)); 1398 ip_cork_release(cork);
1354} 1399}
1355 1400
1401void ip_flush_pending_frames(struct sock *sk)
1402{
1403 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1404}
1405
1406struct sk_buff *ip_make_skb(struct sock *sk,
1407 struct flowi4 *fl4,
1408 int getfrag(void *from, char *to, int offset,
1409 int len, int odd, struct sk_buff *skb),
1410 void *from, int length, int transhdrlen,
1411 struct ipcm_cookie *ipc, struct rtable **rtp,
1412 unsigned int flags)
1413{
1414 struct inet_cork cork;
1415 struct sk_buff_head queue;
1416 int err;
1417
1418 if (flags & MSG_PROBE)
1419 return NULL;
1420
1421 __skb_queue_head_init(&queue);
1422
1423 cork.flags = 0;
1424 cork.addr = 0;
1425 cork.opt = NULL;
1426 err = ip_setup_cork(sk, &cork, ipc, rtp);
1427 if (err)
1428 return ERR_PTR(err);
1429
1430 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1431 from, length, transhdrlen, flags);
1432 if (err) {
1433 __ip_flush_pending_frames(sk, &queue, &cork);
1434 return ERR_PTR(err);
1435 }
1436
1437 return __ip_make_skb(sk, fl4, &queue, &cork);
1438}
1356 1439
1357/* 1440/*
1358 * Fetch data from kernel space and fill in checksum if needed. 1441 * Fetch data from kernel space and fill in checksum if needed.
@@ -1374,48 +1457,39 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1374 * Should run single threaded per socket because it uses the sock 1457 * Should run single threaded per socket because it uses the sock
1375 * structure to pass arguments. 1458 * structure to pass arguments.
1376 */ 1459 */
1377void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, 1460void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1378 unsigned int len) 1461 struct ip_reply_arg *arg, unsigned int len)
1379{ 1462{
1380 struct inet_sock *inet = inet_sk(sk); 1463 struct inet_sock *inet = inet_sk(sk);
1381 struct { 1464 struct ip_options_data replyopts;
1382 struct ip_options opt;
1383 char data[40];
1384 } replyopts;
1385 struct ipcm_cookie ipc; 1465 struct ipcm_cookie ipc;
1386 __be32 daddr; 1466 struct flowi4 fl4;
1387 struct rtable *rt = skb_rtable(skb); 1467 struct rtable *rt = skb_rtable(skb);
1388 1468
1389 if (ip_options_echo(&replyopts.opt, skb)) 1469 if (ip_options_echo(&replyopts.opt.opt, skb))
1390 return; 1470 return;
1391 1471
1392 daddr = ipc.addr = rt->rt_src; 1472 ipc.addr = daddr;
1393 ipc.opt = NULL; 1473 ipc.opt = NULL;
1394 ipc.shtx.flags = 0; 1474 ipc.tx_flags = 0;
1395 1475
1396 if (replyopts.opt.optlen) { 1476 if (replyopts.opt.opt.optlen) {
1397 ipc.opt = &replyopts.opt; 1477 ipc.opt = &replyopts.opt;
1398 1478
1399 if (ipc.opt->srr) 1479 if (replyopts.opt.opt.srr)
1400 daddr = replyopts.opt.faddr; 1480 daddr = replyopts.opt.opt.faddr;
1401 } 1481 }
1402 1482
1403 { 1483 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1404 struct flowi fl = { .oif = arg->bound_dev_if, 1484 RT_TOS(ip_hdr(skb)->tos),
1405 .nl_u = { .ip4_u = 1485 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1406 { .daddr = daddr, 1486 ip_reply_arg_flowi_flags(arg),
1407 .saddr = rt->rt_spec_dst, 1487 daddr, rt->rt_spec_dst,
1408 .tos = RT_TOS(ip_hdr(skb)->tos) } }, 1488 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1409 /* Not quite clean, but right. */ 1489 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1410 .uli_u = { .ports = 1490 rt = ip_route_output_key(sock_net(sk), &fl4);
1411 { .sport = tcp_hdr(skb)->dest, 1491 if (IS_ERR(rt))
1412 .dport = tcp_hdr(skb)->source } }, 1492 return;
1413 .proto = sk->sk_protocol,
1414 .flags = ip_reply_arg_flowi_flags(arg) };
1415 security_skb_classify_flow(skb, &fl);
1416 if (ip_route_output_key(sock_net(sk), &rt, &fl))
1417 return;
1418 }
1419 1493
1420 /* And let IP do all the hard work. 1494 /* And let IP do all the hard work.
1421 1495
@@ -1428,7 +1502,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1428 sk->sk_priority = skb->priority; 1502 sk->sk_priority = skb->priority;
1429 sk->sk_protocol = ip_hdr(skb)->protocol; 1503 sk->sk_protocol = ip_hdr(skb)->protocol;
1430 sk->sk_bound_dev_if = arg->bound_dev_if; 1504 sk->sk_bound_dev_if = arg->bound_dev_if;
1431 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1505 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1432 &ipc, &rt, MSG_DONTWAIT); 1506 &ipc, &rt, MSG_DONTWAIT);
1433 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1507 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1434 if (arg->csumoffset >= 0) 1508 if (arg->csumoffset >= 0)
@@ -1436,7 +1510,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1436 arg->csumoffset) = csum_fold(csum_add(skb->csum, 1510 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1437 arg->csum)); 1511 arg->csum));
1438 skb->ip_summed = CHECKSUM_NONE; 1512 skb->ip_summed = CHECKSUM_NONE;
1439 ip_push_pending_frames(sk); 1513 ip_push_pending_frames(sk, &fl4);
1440 } 1514 }
1441 1515
1442 bh_unlock_sock(sk); 1516 bh_unlock_sock(sk);