Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--   net/core/skbuff.c   172
1 file changed, 127 insertions(+), 45 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1e3e0087245b..f86bf69cfb8d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -72,7 +72,7 @@
 #include <net/ip6_checksum.h>
 #include <net/xfrm.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <trace/events/skb.h>
 #include <linux/highmem.h>
 #include <linux/capability.h>
@@ -271,7 +271,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                atomic_set(&fclones->fclone_ref, 1);
 
                fclones->skb2.fclone = SKB_FCLONE_CLONE;
-               fclones->skb2.pfmemalloc = pfmemalloc;
        }
 out:
        return skb;
@@ -354,7 +353,7 @@ EXPORT_SYMBOL(build_skb);
 
 struct napi_alloc_cache {
        struct page_frag_cache page;
-       size_t skb_count;
+       unsigned int skb_count;
        void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
 
@@ -369,7 +368,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 
        local_irq_save(flags);
        nc = this_cpu_ptr(&netdev_alloc_cache);
-       data = __alloc_page_frag(nc, fragsz, gfp_mask);
+       data = page_frag_alloc(nc, fragsz, gfp_mask);
        local_irq_restore(flags);
        return data;
 }
@@ -391,7 +390,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
-       return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
+       return page_frag_alloc(&nc->page, fragsz, gfp_mask);
 }
 
 void *napi_alloc_frag(unsigned int fragsz)
@@ -441,7 +440,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
        local_irq_save(flags);
 
        nc = this_cpu_ptr(&netdev_alloc_cache);
-       data = __alloc_page_frag(nc, len, gfp_mask);
+       data = page_frag_alloc(nc, len, gfp_mask);
        pfmemalloc = nc->pfmemalloc;
 
        local_irq_restore(flags);
@@ -505,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;
 
-       data = __alloc_page_frag(&nc->page, len, gfp_mask);
+       data = page_frag_alloc(&nc->page, len, gfp_mask);
        if (unlikely(!data))
                return NULL;
 
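The four hunks above only track the rename of __alloc_page_frag() to page_frag_alloc(); signature and behaviour are unchanged, so the public wrappers keep working as before. As a minimal sketch (hypothetical driver helper, not part of this patch), a NAPI rx path would still allocate frags the same way:

/* Hypothetical caller, for illustration: must run in softirq/NAPI
 * context, like napi_alloc_frag() itself.
 */
static void *rx_buf_alloc(unsigned int size)
{
        /* room for the payload plus the skb_shared_info that
         * build_skb() places at the end of the buffer
         */
        return napi_alloc_frag(SKB_DATA_ALIGN(size) +
                               SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
}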
@@ -655,7 +654,7 @@ static void skb_release_head_state(struct sk_buff *skb)
                skb->destructor(skb);
        }
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-       nf_conntrack_put(skb->nfct);
+       nf_conntrack_put(skb_nfct(skb));
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        nf_bridge_put(skb->nf_bridge);
@@ -878,9 +877,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 #ifdef CONFIG_NET_SCHED
        CHECK_SKB_FIELD(tc_index);
-#ifdef CONFIG_NET_CLS_ACT
-       CHECK_SKB_FIELD(tc_verd);
-#endif
 #endif
 
 }
@@ -1195,10 +1191,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
                     gfp_t gfp_mask)
 {
-       int i;
-       u8 *data;
-       int size = nhead + skb_end_offset(skb) + ntail;
+       int i, osize = skb_end_offset(skb);
+       int size = osize + nhead + ntail;
        long off;
+       u8 *data;
 
        BUG_ON(nhead < 0);
 
@@ -1260,6 +1256,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
        skb->hdr_len = 0;
        skb->nohdr = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+       /* It is not generally safe to change skb->truesize.
+        * For the moment, we really care of rx path, or
+        * when skb is orphaned (not attached to a socket).
+        */
+       if (!skb->sk || skb->destructor == sock_edemux)
+               skb->truesize += size - osize;
+
        return 0;
 
 nofrags:
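For illustration, a minimal sketch of an rx-path caller that benefits from the new accounting (hypothetical helper, assuming the skb is not yet attached to a socket): after a successful expansion, skb->truesize reflects the larger skb->head instead of silently under-counting it.

static int example_grow_headroom(struct sk_buff *skb)
{
        /* rx path: skb->sk is NULL here, so pskb_expand_head() is
         * allowed to bump skb->truesize by the head-size growth
         */
        if (skb_headroom(skb) < 16 &&
            pskb_expand_head(skb, 16, 0, GFP_ATOMIC))
                return -ENOMEM;
        return 0;
}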
@@ -2656,7 +2660,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
        struct skb_frag_struct *fragfrom, *fragto;
 
        BUG_ON(shiftlen > skb->len);
-       BUG_ON(skb_headlen(skb));       /* Would corrupt stream */
+
+       if (skb_headlen(skb))
+               return 0;
 
        todo = shiftlen;
        from = 0;
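Since the BUG_ON() is now a soft failure, callers are expected to treat a 0 return as "nothing could be shifted" and leave both skbs intact. A minimal sketch of that pattern (hypothetical wrapper, not from this patch):

static bool example_try_shift(struct sk_buff *tgt, struct sk_buff *skb,
                              int shiftlen)
{
        /* skb_shift() returns the number of bytes actually shifted;
         * 0 now covers the "skb has linear data" case that used to crash
         */
        return skb_shift(tgt, skb, shiftlen) > 0;
}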
@@ -3076,22 +3082,32 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
        if (sg && csum && (mss != GSO_BY_FRAGS)) {
                if (!(features & NETIF_F_GSO_PARTIAL)) {
                        struct sk_buff *iter;
+                       unsigned int frag_len;
 
                        if (!list_skb ||
                            !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
                                goto normal;
 
-                       /* Split the buffer at the frag_list pointer.
-                        * This is based on the assumption that all
-                        * buffers in the chain excluding the last
-                        * containing the same amount of data.
+                       /* If we get here then all the required
+                        * GSO features except frag_list are supported.
+                        * Try to split the SKB to multiple GSO SKBs
+                        * with no frag_list.
+                        * Currently we can do that only when the buffers don't
+                        * have a linear part and all the buffers except
+                        * the last are of the same length.
                         */
+                       frag_len = list_skb->len;
                        skb_walk_frags(head_skb, iter) {
+                               if (frag_len != iter->len && iter->next)
+                                       goto normal;
                                if (skb_headlen(iter))
                                        goto normal;
 
                                len -= iter->len;
                        }
+
+                       if (len != frag_len)
+                               goto normal;
                }
 
                /* GSO partial only requires that we trim off any excess that
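Restated as a standalone predicate (illustrative only; this helper does not exist in the patch, and len stands for the remaining payload the caller tracks in the hunk above), the accepted geometry is: no linear data anywhere, every frag_list member except the last exactly frag_len bytes, and the head's own payload equal to frag_len as well.

static bool gso_frag_list_splittable(const struct sk_buff *head,
                                     unsigned int len)
{
        const struct sk_buff *iter;
        unsigned int frag_len = skb_shinfo(head)->frag_list->len;

        skb_walk_frags(head, iter) {
                /* all buffers but the last must be frag_len long */
                if (frag_len != iter->len && iter->next)
                        return false;
                /* no buffer may have a linear part */
                if (skb_headlen(iter))
                        return false;
                len -= iter->len;
        }
        /* what remains in the head must match the segment size */
        return len == frag_len;
}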
@@ -3688,6 +3704,15 @@ static void sock_rmem_free(struct sk_buff *skb)
        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
 }
 
+static void skb_set_err_queue(struct sk_buff *skb)
+{
+       /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
+        * So, it is safe to (mis)use it to mark skbs on the error queue.
+        */
+       skb->pkt_type = PACKET_OUTGOING;
+       BUILD_BUG_ON(PACKET_OUTGOING == 0);
+}
+
 /*
  * Note: We dont mem charge error packets (no sk_forward_alloc changes)
  */
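The mark is meant to be tested with the mirror-image predicate; a sketch of that counterpart (assumed helper matching the comment above, not shown in this hunk):

static bool skb_is_err_queue(const struct sk_buff *skb)
{
        /* locally received skbs are never PACKET_OUTGOING, so the
         * flag unambiguously identifies error-queue skbs
         */
        return skb->pkt_type == PACKET_OUTGOING;
}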
@@ -3701,6 +3726,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
        skb->sk = sk;
        skb->destructor = sock_rmem_free;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+       skb_set_err_queue(skb);
 
        /* before exiting rcu section, make sure dst is refcounted */
        skb_dst_force(skb);
@@ -3712,21 +3738,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_queue_err_skb);
 
+static bool is_icmp_err_skb(const struct sk_buff *skb)
+{
+       return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+                      SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
+}
+
 struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
 {
        struct sk_buff_head *q = &sk->sk_error_queue;
-       struct sk_buff *skb, *skb_next;
+       struct sk_buff *skb, *skb_next = NULL;
+       bool icmp_next = false;
        unsigned long flags;
-       int err = 0;
 
        spin_lock_irqsave(&q->lock, flags);
        skb = __skb_dequeue(q);
        if (skb && (skb_next = skb_peek(q)))
-               err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+               icmp_next = is_icmp_err_skb(skb_next);
        spin_unlock_irqrestore(&q->lock, flags);
 
-       sk->sk_err = err;
-       if (err)
+       if (is_icmp_err_skb(skb) && !icmp_next)
+               sk->sk_err = 0;
+
+       if (skb_next)
                sk->sk_error_report(sk);
 
        return skb;
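From userspace, the visible effect is on plain error-queue reads: draining the last ICMP-origin error now also clears sk_err, and sk_error_report() only fires while more errors remain queued. A minimal consumer sketch (standard uapi calls, assuming IP_RECVERR or tx timestamping is already enabled on fd):

#include <sys/socket.h>
#include <sys/uio.h>

static int drain_errqueue(int fd)
{
        char data[256], control[256];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = control,
                .msg_controllen = sizeof(control),
        };

        /* returns one queued error; cmsgs carry the sock_extended_err */
        return recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT);
}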
@@ -3769,16 +3803,21 @@ EXPORT_SYMBOL(skb_clone_sk);
 
 static void __skb_complete_tx_timestamp(struct sk_buff *skb,
                                        struct sock *sk,
-                                       int tstype)
+                                       int tstype,
+                                       bool opt_stats)
 {
        struct sock_exterr_skb *serr;
        int err;
 
+       BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
+
        serr = SKB_EXT_ERR(skb);
        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
        serr->ee.ee_info = tstype;
+       serr->opt_stats = opt_stats;
+       serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
        if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
                serr->ee.ee_data = skb_shinfo(skb)->tskey;
                if (sk->sk_protocol == IPPROTO_TCP &&
@@ -3814,13 +3853,14 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
        if (!skb_may_tx_timestamp(sk, false))
                return;
 
-       /* take a reference to prevent skb_orphan() from freeing the socket */
-       sock_hold(sk);
-
-       *skb_hwtstamps(skb) = *hwtstamps;
-       __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
-
-       sock_put(sk);
+       /* Take a reference to prevent skb_orphan() from freeing the socket,
+        * but only if the socket refcount is not zero.
+        */
+       if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+               *skb_hwtstamps(skb) = *hwtstamps;
+               __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
+               sock_put(sk);
+       }
 }
 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
 
@@ -3829,7 +3869,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
                     struct sock *sk, int tstype)
 {
        struct sk_buff *skb;
-       bool tsonly;
+       bool tsonly, opt_stats = false;
 
        if (!sk)
                return;
@@ -3838,10 +3878,19 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
        if (!skb_may_tx_timestamp(sk, tsonly))
                return;
 
-       if (tsonly)
-               skb = alloc_skb(0, GFP_ATOMIC);
-       else
+       if (tsonly) {
+#ifdef CONFIG_INET
+               if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+                   sk->sk_protocol == IPPROTO_TCP &&
+                   sk->sk_type == SOCK_STREAM) {
+                       skb = tcp_get_timestamping_opt_stats(sk);
+                       opt_stats = true;
+               } else
+#endif
+                       skb = alloc_skb(0, GFP_ATOMIC);
+       } else {
                skb = skb_clone(orig_skb, GFP_ATOMIC);
+       }
        if (!skb)
                return;
 
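To actually receive the new opt_stats payload, userspace opts in via SO_TIMESTAMPING on a TCP socket. A hedged sketch (header availability varies by libc; SOF_TIMESTAMPING_OPT_STATS is documented to require SOF_TIMESTAMPING_OPT_TSONLY):

#include <linux/net_tstamp.h>
#include <sys/socket.h>

static int enable_tx_stats(int fd)
{
        unsigned int val = SOF_TIMESTAMPING_TX_ACK |
                           SOF_TIMESTAMPING_SOFTWARE |
                           SOF_TIMESTAMPING_OPT_TSONLY |
                           SOF_TIMESTAMPING_OPT_STATS;

        return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
                          &val, sizeof(val));
}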
@@ -3855,7 +3904,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
        else
                skb->tstamp = ktime_get_real();
 
-       __skb_complete_tx_timestamp(skb, sk, tstype);
+       __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
 }
 EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
 
@@ -3871,7 +3920,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
 {
        struct sock *sk = skb->sk;
        struct sock_exterr_skb *serr;
-       int err;
+       int err = 1;
 
        skb->wifi_acked_valid = 1;
        skb->wifi_acked = acked;
@@ -3881,14 +3930,15 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
 
-       /* take a reference to prevent skb_orphan() from freeing the socket */
-       sock_hold(sk);
-
-       err = sock_queue_err_skb(sk, skb);
+       /* Take a reference to prevent skb_orphan() from freeing the socket,
+        * but only if the socket refcount is not zero.
+        */
+       if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+               err = sock_queue_err_skb(sk, skb);
+               sock_put(sk);
+       }
        if (err)
                kfree_skb(skb);
-
-       sock_put(sk);
 }
 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
 
@@ -4350,7 +4400,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
  */
 void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 {
-       skb->tstamp.tv64 = 0;
+       skb->tstamp = 0;
        skb->pkt_type = PACKET_HOST;
        skb->skb_iif = 0;
        skb->ignore_df = 0;
@@ -4913,3 +4963,35 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
        return clone;
 }
 EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ *     We do not reallocate skb->head thus can not fail.
+ *     Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+       if (skb->data_len) {
+               if (skb->data_len > skb->end - skb->tail ||
+                   skb_cloned(skb))
+                       return;
+
+               /* Nice, we can free page frag(s) right now */
+               __pskb_pull_tail(skb, skb->data_len);
+       }
+       /* At this point, skb->truesize might be over estimated,
+        * because skb had a fragment, and fragments do not tell
+        * their truesize.
+        * When we pulled its content into skb->head, fragment
+        * was freed, but __pskb_pull_tail() could not possibly
+        * adjust skb->truesize, not knowing the frag truesize.
+        */
+       skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
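A sketch of the intended call-site pattern (hypothetical enqueue helper, mirroring the kernel-doc note that the caller must re-evaluate truesize): condense before parking the skb on a long-lived queue, then charge the possibly smaller truesize to the socket.

static void example_enqueue(struct sock *sk, struct sk_buff *skb)
{
        /* caller is assumed to hold the receive-queue lock */
        skb_condense(skb);              /* may shrink skb->truesize */
        skb_set_owner_r(skb, sk);       /* charges the new truesize */
        __skb_queue_tail(&sk->sk_receive_queue, skb);
}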