path: root/include/net/tcp.h
author    Linus Torvalds <torvalds@linux-foundation.org>  2014-10-08 21:40:54 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-10-08 21:40:54 -0400
commit    35a9ad8af0bb0fa3525e6d0d20e32551d226f38e (patch)
tree      15b4b33206818886d9cff371fd2163e073b70568 /include/net/tcp.h
parent    d5935b07da53f74726e2a65dd4281d0f2c70e5d4 (diff)
parent    64b1f00a0830e1c53874067273a096b228d83d36 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
 "Most notable changes in here:

  1) By far the biggest accomplishment, thanks to a large range of
     contributors, is the addition of multi-send for transmit.  This is
     the result of discussions back in Chicago, and the hard work of
     several individuals.

     Now, when the ->ndo_start_xmit() method of a driver sees
     skb->xmit_more as true, it can choose to defer the doorbell that
     tells the device to start processing the new TX queue entries.

     skb->xmit_more means that the generic networking layer guarantees
     the driver will immediately be called with another SKB to send.

     There is logic added to the qdisc layer to dequeue multiple
     packets at a time, and the handling of mis-predicted offloads in
     software is now done with no locks held.

     Finally, pktgen is extended with a "burst" parameter that can be
     used to test a multi-send implementation.

     Several drivers have xmit_more support already: i40e, igb, ixgbe,
     mlx4, virtio_net.  Adding support is almost trivial, so expect
     more drivers to support this optimization soon.  [A minimal
     driver-side sketch of the xmit_more pattern follows the shortlog
     below.]

     I want to thank, in no particular or implied order, Jesper
     Dangaard Brouer, Eric Dumazet, Alexander Duyck, Tom Herbert,
     Jamal Hadi Salim, John Fastabend, Florian Westphal, Daniel
     Borkmann, David Tat, Hannes Frederic Sowa, and Rusty Russell.

  2) PTP and timestamping support in bnx2x, from Michal Kalderon.

  3) Allow adjusting the rx_copybreak threshold for a driver via
     ethtool, and add rx_copybreak support to the enic driver.  From
     Govindarajulu Varadarajan.

  4) Significant enhancements to the generic PHY layer and the bcm7xxx
     driver in particular (EEE support, auto power down, etc.), from
     Florian Fainelli.

  5) Allow raw buffers to be used for flow dissection, allowing drivers
     to determine the optimal "linear pull" size for devices that DMA
     into pools of pages.  The objective is to get exactly the
     necessary amount of headers into the linear SKB area pre-pulled,
     but no more.  The new interface drivers use is eth_get_headlen().
     From WANG Cong, with driver conversions (several had their own
     by-hand duplicated implementations) by Alexander Duyck and Eric
     Dumazet.

  6) Support checksumming more smoothly and efficiently for
     encapsulations, and add the "foo over UDP" facility.  From Tom
     Herbert.

  7) Add the Broadcom SF2 switch driver to the DSA layer, from Florian
     Fainelli.

  8) eBPF can now load programs via a system call and has an extensive
     testsuite.  From Alexei Starovoitov and Daniel Borkmann.

  9) Major overhaul of the packet scheduler to use RCU in several major
     areas such as the classifiers and rate estimators.  From John
     Fastabend.

 10) Add a driver for the Intel FM10000 Ethernet Switch, from Alexander
     Duyck.

 11) Rearrange TCP_SKB_CB() to reduce cache line misses, from Eric
     Dumazet.

 12) Add Datacenter TCP (DCTCP) congestion control algorithm support,
     from Florian Westphal.

 13) Reorganize sk_buff so that __copy_skb_header() is significantly
     faster.  From Eric Dumazet"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1558 commits)
  netlabel: directly return netlbl_unlabel_genl_init()
  net: add netdev_txq_bql_{enqueue, complete}_prefetchw() helpers
  net: description of dma_cookie cause make xmldocs warning
  cxgb4: clean up a type issue
  cxgb4: potential shift wrapping bug
  i40e: skb->xmit_more support
  net: fs_enet: Add NAPI TX
  net: fs_enet: Remove non NAPI RX
  r8169:add support for RTL8168EP
  net_sched: copy exts->type in tcf_exts_change()
  wimax: convert printk to pr_foo()
  af_unix: remove 0 assignment on static
  ipv6: Do not warn for informational ICMP messages, regardless of type.
  Update Intel Ethernet Driver maintainers list
  bridge: Save frag_max_size between PRE_ROUTING and POST_ROUTING
  tipc: fix bug in multicast congestion handling
  net: better IFF_XMIT_DST_RELEASE support
  net/mlx4_en: remove NETDEV_TX_BUSY
  3c59x: fix bad split of cpu_to_le32(pci_map_single())
  net: bcmgenet: fix Tx ring priority programming
  ...
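A minimal sketch of the xmit_more pattern described in item 1 above.
Everything named my_* is a hypothetical stand-in for a driver's private
ring code, not any real driver's API; only skb->xmit_more,
netif_xmit_stopped(), and netdev_get_tx_queue() are the real kernel
interfaces involved.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct my_ring {                        /* hypothetical ring state */
	u16 queue_index;
	/* descriptors, tail pointer, ... */
};

static struct my_ring *my_ring_for_skb(struct net_device *dev,
				       struct sk_buff *skb);     /* hypothetical */
static void my_ring_enqueue(struct my_ring *ring,
			    struct sk_buff *skb);                /* hypothetical */
static void my_ring_doorbell(struct my_ring *ring);              /* hypothetical */

static netdev_tx_t my_ndo_start_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct my_ring *ring = my_ring_for_skb(dev, skb);

	my_ring_enqueue(ring, skb);     /* post TX descriptor(s) */

	/* Defer the (expensive, MMIO) doorbell write while the stack
	 * promises to call us again immediately with another skb, but
	 * never leave posted work un-doorbelled if the queue stops.
	 */
	if (!skb->xmit_more ||
	    netif_xmit_stopped(netdev_get_tx_queue(dev, ring->queue_index)))
		my_ring_doorbell(ring);

	return NETDEV_TX_OK;
}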
Diffstat (limited to 'include/net/tcp.h')
-rw-r--r--  include/net/tcp.h  85
1 file changed, 57 insertions(+), 28 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7523c325673e..74efeda994b3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -669,6 +669,12 @@ void tcp_send_window_probe(struct sock *sk);
  */
 #define tcp_time_stamp		((__u32)(jiffies))
 
+static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
+{
+	return skb->skb_mstamp.stamp_jiffies;
+}
+
+
 #define tcp_flag_byte(th) (((u_int8_t *)th)[13])
 
 #define TCPHDR_FIN 0x01
@@ -687,15 +693,18 @@ void tcp_send_window_probe(struct sock *sk);
  * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
  */
 struct tcp_skb_cb {
-	union {
-		struct inet_skb_parm	h4;
-#if IS_ENABLED(CONFIG_IPV6)
-		struct inet6_skb_parm	h6;
-#endif
-	} header;	/* For incoming frames		*/
 	__u32		seq;		/* Starting sequence number	*/
 	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
-	__u32		when;		/* used to compute rtt's	*/
+	union {
+		/* Note : tcp_tw_isn is used in input path only
+		 *	  (isn chosen by tcp_timewait_state_process())
+		 *
+		 *	  tcp_gso_segs is used in write queue only,
+		 *	  cf tcp_skb_pcount()
+		 */
+		__u32		tcp_tw_isn;
+		__u32		tcp_gso_segs;
+	};
 	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/
 
 	__u8		sacked;		/* State flags for SACK/FACK.	*/
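The removed `when` slot becomes a two-member union, so the struct does
not grow: tcp_tw_isn is only meaningful on the input path and
tcp_gso_segs only on the write queue.  The comment at the top of the
struct ("If this grows please adjust skbuff.h:skbuff->cb[xxx] size
appropriately") is a hard constraint, since tcp_skb_cb is overlaid on
sk_buff->cb[].  A compile-time check in that spirit could look like the
following sketch; the kernel enforces an equivalent BUILD_BUG_ON at TCP
init time, and the function name here is invented:

#include <linux/kernel.h>
#include <net/tcp.h>

static inline void example_assert_tcp_cb_fits(void)
{
	/* Build fails if struct tcp_skb_cb outgrows sk_buff->cb[]. */
	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
		     FIELD_SIZEOF(struct sk_buff, cb));
}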
@@ -711,33 +720,32 @@ struct tcp_skb_cb {
 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
 	/* 1 byte hole */
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
+	union {
+		struct inet_skb_parm	h4;
+#if IS_ENABLED(CONFIG_IPV6)
+		struct inet6_skb_parm	h6;
+#endif
+	} header;	/* For incoming frames		*/
 };
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
-/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
- *
- * If we receive a SYN packet with these bits set, it means a network is
- * playing bad games with TOS bits. In order to avoid possible false congestion
- * notifications, we disable TCP ECN negotiation.
+/* Due to TSO, an SKB can be composed of multiple actual
+ * packets.  To keep these tracked properly, we use this.
  */
-static inline void
-TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
-		struct net *net)
+static inline int tcp_skb_pcount(const struct sk_buff *skb)
 {
-	const struct tcphdr *th = tcp_hdr(skb);
+	return TCP_SKB_CB(skb)->tcp_gso_segs;
+}
 
-	if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
-	    INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
-		inet_rsk(req)->ecn_ok = 1;
+static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
+{
+	TCP_SKB_CB(skb)->tcp_gso_segs = segs;
 }
 
-/* Due to TSO, an SKB can be composed of multiple actual
- * packets.  To keep these tracked properly, we use this.
- */
-static inline int tcp_skb_pcount(const struct sk_buff *skb)
+static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
 {
-	return skb_shinfo(skb)->gso_segs;
+	TCP_SKB_CB(skb)->tcp_gso_segs += segs;
 }
 
 /* This is valid iff tcp_skb_pcount() > 1. */
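tcp_skb_pcount() is now backed by TCP_SKB_CB(skb)->tcp_gso_segs rather
than skb_shinfo(skb)->gso_segs (one fewer cache line to touch on the
hot path), and writers must go through the new _set()/_add() helpers.
A simplified, non-verbatim sketch of how write-queue code stamps a
segment count with them, in the style of tcp_set_skb_tso_segs():

#include <net/tcp.h>

/* Simplified illustration; the real tcp_set_skb_tso_segs() also
 * updates skb_shinfo(skb)->gso_size and handles more corner cases.
 */
static void example_stamp_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
	if (skb->len <= mss_now)
		tcp_skb_pcount_set(skb, 1);	/* fits one wire packet */
	else
		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
}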
@@ -752,8 +760,17 @@ enum tcp_ca_event {
 	CA_EVENT_CWND_RESTART,	/* congestion window restart */
 	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
 	CA_EVENT_LOSS,		/* loss timeout */
-	CA_EVENT_FAST_ACK,	/* in sequence ack */
-	CA_EVENT_SLOW_ACK,	/* other ack */
+	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
+	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
+	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
+	CA_EVENT_NON_DELAYED_ACK,
+};
+
+/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
+enum tcp_ca_ack_event_flags {
+	CA_ACK_SLOWPATH		= (1 << 0), /* In slow path processing */
+	CA_ACK_WIN_UPDATE	= (1 << 1), /* ACK updated window */
+	CA_ACK_ECE		= (1 << 2), /* ECE bit is set on ack */
 };
 
 /*
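The coarse FAST_ACK/SLOW_ACK cwnd events are replaced by the ECN and
delayed-ack events above plus a dedicated per-ACK hook, in_ack_event(),
fed with these flag bits; DCTCP (item 12 of the merge message) is the
consumer.  A minimal sketch of such a handler, with invented private
state (a real algorithm tracks byte counts and an EWMA, not simple
counters):

#include <net/tcp.h>

struct example_ca {			/* invented ca private state */
	u32	acks_total;
	u32	acks_with_ece;
};

static void example_in_ack_event(struct sock *sk, u32 flags)
{
	struct example_ca *ca = inet_csk_ca(sk);

	ca->acks_total++;
	if (flags & CA_ACK_ECE)		/* receiver echoed a CE mark */
		ca->acks_with_ece++;
}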
@@ -763,7 +780,10 @@ enum tcp_ca_event {
 #define TCP_CA_MAX	128
 #define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)
 
+/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
 #define TCP_CONG_NON_RESTRICTED 0x1
+/* Requires ECN/ECT set on all packets */
+#define TCP_CONG_NEEDS_ECN	0x2
 
 struct tcp_congestion_ops {
 	struct list_head	list;
@@ -782,6 +802,8 @@ struct tcp_congestion_ops {
 	void (*set_state)(struct sock *sk, u8 new_state);
 	/* call when cwnd event occurs (optional) */
 	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+	/* call when ack arrives (optional) */
+	void (*in_ack_event)(struct sock *sk, u32 flags);
 	/* new value of cwnd after loss (optional) */
 	u32 (*undo_cwnd)(struct sock *sk);
 	/* hook for packet ack accounting (optional) */
@@ -796,6 +818,7 @@ struct tcp_congestion_ops {
 int tcp_register_congestion_control(struct tcp_congestion_ops *type);
 void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
 
+void tcp_assign_congestion_control(struct sock *sk);
 void tcp_init_congestion_control(struct sock *sk);
 void tcp_cleanup_congestion_control(struct sock *sk);
 int tcp_set_default_congestion_control(const char *name);
@@ -804,14 +827,20 @@ void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
 int tcp_set_congestion_control(struct sock *sk, const char *name);
-int tcp_slow_start(struct tcp_sock *tp, u32 acked);
+void tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
 
-extern struct tcp_congestion_ops tcp_init_congestion_ops;
 u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
+static inline bool tcp_ca_needs_ecn(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
+}
+
 static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
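Putting the new pieces together, a skeleton of a congestion control
module that opts into the new machinery.  All example_* names are
hypothetical (reusing the in_ack_event() handler sketched after the
tcp_ca_event hunk above); TCP_CONG_NEEDS_ECN, the ops fields, and the
registration call are the real interfaces, and tcp_ca_needs_ecn() above
is how the stack detects such an algorithm so it can force ECN
negotiation for its flows:

#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops example_ca_ops __read_mostly = {
	.flags		= TCP_CONG_NEEDS_ECN,	/* ECT required on all packets */
	.ssthresh	= tcp_reno_ssthresh,	/* required hook */
	.cong_avoid	= tcp_reno_cong_avoid,	/* required hook */
	.in_ack_event	= example_in_ack_event,	/* optional, sketched above */
	.owner		= THIS_MODULE,
	.name		= "example",
};

static int __init example_ca_init(void)
{
	return tcp_register_congestion_control(&example_ca_ops);
}
module_init(example_ca_init);
MODULE_LICENSE("GPL");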