about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: John Fastabend <john.fastabend@gmail.com> 2018-05-14 13:00:16 -0400
committer: Daniel Borkmann <daniel@iogearbox.net> 2018-05-15 11:19:59 -0400
commite5cd3abcb31a48d4ea91bd32f0618802ca5f3592 (patch)
tree9f9a20ba785895a7484d60dad2c786e0cd4f04a7
parentf2467c2dbc019548052f3a64dc1efd01c0ae27aa (diff)
bpf: sockmap, refactor sockmap routines to work with hashmap
This patch only refactors the existing sockmap code. This will allow much of the psock initialization code path and bpf helper codes to work for both sockmap bpf map types that are backed by an array, the currently supported type, and the new hash backed bpf map type sockhash.

Most of the fallout comes from three changes:

- Pushing bpf programs into an independent structure so we can use it from the htab struct in the next patch.
- Generalizing helpers to use void *key instead of the hardcoded u32.
- Instead of passing map/key through the metadata we now do the lookup inline. This avoids storing the key in the metadata, which will be useful when keys can be longer than 4 bytes. We rename the sk pointers to sk_redir at this point as well to avoid any confusion between the current sk pointer and the redirect pointer sk_redir.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
-rw-r--r--include/linux/filter.h3
-rw-r--r--include/net/tcp.h3
-rw-r--r--kernel/bpf/sockmap.c148
-rw-r--r--net/core/filter.c31
4 files changed, 98 insertions, 87 deletions
diff --git a/include/linux/filter.h b/include/linux/filter.h
index da7e16523128..9dbcb9d55921 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -515,9 +515,8 @@ struct sk_msg_buff {
515 int sg_end; 515 int sg_end;
516 struct scatterlist sg_data[MAX_SKB_FRAGS]; 516 struct scatterlist sg_data[MAX_SKB_FRAGS];
517 bool sg_copy[MAX_SKB_FRAGS]; 517 bool sg_copy[MAX_SKB_FRAGS];
518 __u32 key;
519 __u32 flags; 518 __u32 flags;
520 struct bpf_map *map; 519 struct sock *sk_redir;
521 struct sk_buff *skb; 520 struct sk_buff *skb;
522 struct list_head list; 521 struct list_head list;
523}; 522};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index cf803fe0fb86..059287374ba0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -814,9 +814,8 @@ struct tcp_skb_cb {
814#endif 814#endif
815 } header; /* For incoming skbs */ 815 } header; /* For incoming skbs */
816 struct { 816 struct {
817 __u32 key;
818 __u32 flags; 817 __u32 flags;
819 struct bpf_map *map; 818 struct sock *sk_redir;
820 void *data_end; 819 void *data_end;
821 } bpf; 820 } bpf;
822 }; 821 };
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 098eca568c2b..beab9ec9b023 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -48,14 +48,18 @@
48#define SOCK_CREATE_FLAG_MASK \ 48#define SOCK_CREATE_FLAG_MASK \
49 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) 49 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
50 50
51struct bpf_stab { 51struct bpf_sock_progs {
52 struct bpf_map map;
53 struct sock **sock_map;
54 struct bpf_prog *bpf_tx_msg; 52 struct bpf_prog *bpf_tx_msg;
55 struct bpf_prog *bpf_parse; 53 struct bpf_prog *bpf_parse;
56 struct bpf_prog *bpf_verdict; 54 struct bpf_prog *bpf_verdict;
57}; 55};
58 56
57struct bpf_stab {
58 struct bpf_map map;
59 struct sock **sock_map;
60 struct bpf_sock_progs progs;
61};
62
59enum smap_psock_state { 63enum smap_psock_state {
60 SMAP_TX_RUNNING, 64 SMAP_TX_RUNNING,
61}; 65};
@@ -461,7 +465,7 @@ static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
461static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) 465static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
462{ 466{
463 return ((_rc == SK_PASS) ? 467 return ((_rc == SK_PASS) ?
464 (md->map ? __SK_REDIRECT : __SK_PASS) : 468 (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
465 __SK_DROP); 469 __SK_DROP);
466} 470}
467 471
@@ -1092,7 +1096,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
1092 * when we orphan the skb so that we don't have the possibility 1096 * when we orphan the skb so that we don't have the possibility
1093 * to reference a stale map. 1097 * to reference a stale map.
1094 */ 1098 */
1095 TCP_SKB_CB(skb)->bpf.map = NULL; 1099 TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
1096 skb->sk = psock->sock; 1100 skb->sk = psock->sock;
1097 bpf_compute_data_pointers(skb); 1101 bpf_compute_data_pointers(skb);
1098 preempt_disable(); 1102 preempt_disable();
@@ -1102,7 +1106,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
1102 1106
1103 /* Moving return codes from UAPI namespace into internal namespace */ 1107 /* Moving return codes from UAPI namespace into internal namespace */
1104 return rc == SK_PASS ? 1108 return rc == SK_PASS ?
1105 (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) : 1109 (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
1106 __SK_DROP; 1110 __SK_DROP;
1107} 1111}
1108 1112
@@ -1372,7 +1376,6 @@ static int smap_init_sock(struct smap_psock *psock,
1372} 1376}
1373 1377
1374static void smap_init_progs(struct smap_psock *psock, 1378static void smap_init_progs(struct smap_psock *psock,
1375 struct bpf_stab *stab,
1376 struct bpf_prog *verdict, 1379 struct bpf_prog *verdict,
1377 struct bpf_prog *parse) 1380 struct bpf_prog *parse)
1378{ 1381{
@@ -1450,14 +1453,13 @@ static void smap_gc_work(struct work_struct *w)
1450 kfree(psock); 1453 kfree(psock);
1451} 1454}
1452 1455
1453static struct smap_psock *smap_init_psock(struct sock *sock, 1456static struct smap_psock *smap_init_psock(struct sock *sock, int node)
1454 struct bpf_stab *stab)
1455{ 1457{
1456 struct smap_psock *psock; 1458 struct smap_psock *psock;
1457 1459
1458 psock = kzalloc_node(sizeof(struct smap_psock), 1460 psock = kzalloc_node(sizeof(struct smap_psock),
1459 GFP_ATOMIC | __GFP_NOWARN, 1461 GFP_ATOMIC | __GFP_NOWARN,
1460 stab->map.numa_node); 1462 node);
1461 if (!psock) 1463 if (!psock)
1462 return ERR_PTR(-ENOMEM); 1464 return ERR_PTR(-ENOMEM);
1463 1465
@@ -1662,40 +1664,26 @@ out:
1662 * - sock_map must use READ_ONCE and (cmp)xchg operations 1664 * - sock_map must use READ_ONCE and (cmp)xchg operations
1663 * - BPF verdict/parse programs must use READ_ONCE and xchg operations 1665 * - BPF verdict/parse programs must use READ_ONCE and xchg operations
1664 */ 1666 */
1665static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, 1667
1666 struct bpf_map *map, 1668static int __sock_map_ctx_update_elem(struct bpf_map *map,
1667 void *key, u64 flags) 1669 struct bpf_sock_progs *progs,
1670 struct sock *sock,
1671 struct sock **map_link,
1672 void *key)
1668{ 1673{
1669 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
1670 struct smap_psock_map_entry *e = NULL;
1671 struct bpf_prog *verdict, *parse, *tx_msg; 1674 struct bpf_prog *verdict, *parse, *tx_msg;
1672 struct sock *osock, *sock; 1675 struct smap_psock_map_entry *e = NULL;
1673 struct smap_psock *psock; 1676 struct smap_psock *psock;
1674 u32 i = *(u32 *)key;
1675 bool new = false; 1677 bool new = false;
1676 int err; 1678 int err;
1677 1679
1678 if (unlikely(flags > BPF_EXIST))
1679 return -EINVAL;
1680
1681 if (unlikely(i >= stab->map.max_entries))
1682 return -E2BIG;
1683
1684 sock = READ_ONCE(stab->sock_map[i]);
1685 if (flags == BPF_EXIST && !sock)
1686 return -ENOENT;
1687 else if (flags == BPF_NOEXIST && sock)
1688 return -EEXIST;
1689
1690 sock = skops->sk;
1691
1692 /* 1. If sock map has BPF programs those will be inherited by the 1680 /* 1. If sock map has BPF programs those will be inherited by the
1693 * sock being added. If the sock is already attached to BPF programs 1681 * sock being added. If the sock is already attached to BPF programs
1694 * this results in an error. 1682 * this results in an error.
1695 */ 1683 */
1696 verdict = READ_ONCE(stab->bpf_verdict); 1684 verdict = READ_ONCE(progs->bpf_verdict);
1697 parse = READ_ONCE(stab->bpf_parse); 1685 parse = READ_ONCE(progs->bpf_parse);
1698 tx_msg = READ_ONCE(stab->bpf_tx_msg); 1686 tx_msg = READ_ONCE(progs->bpf_tx_msg);
1699 1687
1700 if (parse && verdict) { 1688 if (parse && verdict) {
1701 /* bpf prog refcnt may be zero if a concurrent attach operation 1689 /* bpf prog refcnt may be zero if a concurrent attach operation
@@ -1703,11 +1691,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
1703 * we increment the refcnt. If this is the case abort with an 1691 * we increment the refcnt. If this is the case abort with an
1704 * error. 1692 * error.
1705 */ 1693 */
1706 verdict = bpf_prog_inc_not_zero(stab->bpf_verdict); 1694 verdict = bpf_prog_inc_not_zero(progs->bpf_verdict);
1707 if (IS_ERR(verdict)) 1695 if (IS_ERR(verdict))
1708 return PTR_ERR(verdict); 1696 return PTR_ERR(verdict);
1709 1697
1710 parse = bpf_prog_inc_not_zero(stab->bpf_parse); 1698 parse = bpf_prog_inc_not_zero(progs->bpf_parse);
1711 if (IS_ERR(parse)) { 1699 if (IS_ERR(parse)) {
1712 bpf_prog_put(verdict); 1700 bpf_prog_put(verdict);
1713 return PTR_ERR(parse); 1701 return PTR_ERR(parse);
@@ -1715,7 +1703,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
1715 } 1703 }
1716 1704
1717 if (tx_msg) { 1705 if (tx_msg) {
1718 tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); 1706 tx_msg = bpf_prog_inc_not_zero(progs->bpf_tx_msg);
1719 if (IS_ERR(tx_msg)) { 1707 if (IS_ERR(tx_msg)) {
1720 if (verdict) 1708 if (verdict)
1721 bpf_prog_put(verdict); 1709 bpf_prog_put(verdict);
@@ -1748,7 +1736,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
1748 goto out_progs; 1736 goto out_progs;
1749 } 1737 }
1750 } else { 1738 } else {
1751 psock = smap_init_psock(sock, stab); 1739 psock = smap_init_psock(sock, map->numa_node);
1752 if (IS_ERR(psock)) { 1740 if (IS_ERR(psock)) {
1753 err = PTR_ERR(psock); 1741 err = PTR_ERR(psock);
1754 goto out_progs; 1742 goto out_progs;
@@ -1763,7 +1751,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
1763 err = -ENOMEM; 1751 err = -ENOMEM;
1764 goto out_progs; 1752 goto out_progs;
1765 } 1753 }
1766 e->entry = &stab->sock_map[i];
1767 1754
1768 /* 3. At this point we have a reference to a valid psock that is 1755 /* 3. At this point we have a reference to a valid psock that is
1769 * running. Attach any BPF programs needed. 1756 * running. Attach any BPF programs needed.
@@ -1780,7 +1767,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
1780 err = smap_init_sock(psock, sock); 1767 err = smap_init_sock(psock, sock);
1781 if (err) 1768 if (err)
1782 goto out_free; 1769 goto out_free;
1783 smap_init_progs(psock, stab, verdict, parse); 1770 smap_init_progs(psock, verdict, parse);
1784 smap_start_sock(psock, sock); 1771 smap_start_sock(psock, sock);
1785 } 1772 }
1786 1773
@@ -1789,19 +1776,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
1789 * it with. Because we can only have a single set of programs if 1776 * it with. Because we can only have a single set of programs if
1790 * old_sock has a strp we can stop it. 1777 * old_sock has a strp we can stop it.
1791 */ 1778 */
1792 list_add_tail(&e->list, &psock->maps); 1779 if (map_link) {
1793 write_unlock_bh(&sock->sk_callback_lock); 1780 e->entry = map_link;
1794 1781 list_add_tail(&e->list, &psock->maps);
1795 osock = xchg(&stab->sock_map[i], sock);
1796 if (osock) {
1797 struct smap_psock *opsock = smap_psock_sk(osock);
1798
1799 write_lock_bh(&osock->sk_callback_lock);
1800 smap_list_remove(opsock, &stab->sock_map[i]);
1801 smap_release_sock(opsock, osock);
1802 write_unlock_bh(&osock->sk_callback_lock);
1803 } 1782 }
1804 return 0; 1783 write_unlock_bh(&sock->sk_callback_lock);
1784 return err;
1805out_free: 1785out_free:
1806 smap_release_sock(psock, sock); 1786 smap_release_sock(psock, sock);
1807out_progs: 1787out_progs:
@@ -1816,23 +1796,69 @@ out_progs:
1816 return err; 1796 return err;
1817} 1797}
1818 1798
1819int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) 1799static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
1800 struct bpf_map *map,
1801 void *key, u64 flags)
1820{ 1802{
1821 struct bpf_stab *stab = container_of(map, struct bpf_stab, map); 1803 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
1804 struct bpf_sock_progs *progs = &stab->progs;
1805 struct sock *osock, *sock;
1806 u32 i = *(u32 *)key;
1807 int err;
1808
1809 if (unlikely(flags > BPF_EXIST))
1810 return -EINVAL;
1811
1812 if (unlikely(i >= stab->map.max_entries))
1813 return -E2BIG;
1814
1815 sock = READ_ONCE(stab->sock_map[i]);
1816 if (flags == BPF_EXIST && !sock)
1817 return -ENOENT;
1818 else if (flags == BPF_NOEXIST && sock)
1819 return -EEXIST;
1820
1821 sock = skops->sk;
1822 err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
1823 key);
1824 if (err)
1825 goto out;
1826
1827 osock = xchg(&stab->sock_map[i], sock);
1828 if (osock) {
1829 struct smap_psock *opsock = smap_psock_sk(osock);
1830
1831 write_lock_bh(&osock->sk_callback_lock);
1832 smap_list_remove(opsock, &stab->sock_map[i]);
1833 smap_release_sock(opsock, osock);
1834 write_unlock_bh(&osock->sk_callback_lock);
1835 }
1836out:
1837 return 0;
1838}
1839
1840int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
1841{
1842 struct bpf_sock_progs *progs;
1822 struct bpf_prog *orig; 1843 struct bpf_prog *orig;
1823 1844
1824 if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP)) 1845 if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
1846 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
1847
1848 progs = &stab->progs;
1849 } else {
1825 return -EINVAL; 1850 return -EINVAL;
1851 }
1826 1852
1827 switch (type) { 1853 switch (type) {
1828 case BPF_SK_MSG_VERDICT: 1854 case BPF_SK_MSG_VERDICT:
1829 orig = xchg(&stab->bpf_tx_msg, prog); 1855 orig = xchg(&progs->bpf_tx_msg, prog);
1830 break; 1856 break;
1831 case BPF_SK_SKB_STREAM_PARSER: 1857 case BPF_SK_SKB_STREAM_PARSER:
1832 orig = xchg(&stab->bpf_parse, prog); 1858 orig = xchg(&progs->bpf_parse, prog);
1833 break; 1859 break;
1834 case BPF_SK_SKB_STREAM_VERDICT: 1860 case BPF_SK_SKB_STREAM_VERDICT:
1835 orig = xchg(&stab->bpf_verdict, prog); 1861 orig = xchg(&progs->bpf_verdict, prog);
1836 break; 1862 break;
1837 default: 1863 default:
1838 return -EOPNOTSUPP; 1864 return -EOPNOTSUPP;
@@ -1881,16 +1907,18 @@ static int sock_map_update_elem(struct bpf_map *map,
1881static void sock_map_release(struct bpf_map *map) 1907static void sock_map_release(struct bpf_map *map)
1882{ 1908{
1883 struct bpf_stab *stab = container_of(map, struct bpf_stab, map); 1909 struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
1910 struct bpf_sock_progs *progs;
1884 struct bpf_prog *orig; 1911 struct bpf_prog *orig;
1885 1912
1886 orig = xchg(&stab->bpf_parse, NULL); 1913 progs = &stab->progs;
1914 orig = xchg(&progs->bpf_parse, NULL);
1887 if (orig) 1915 if (orig)
1888 bpf_prog_put(orig); 1916 bpf_prog_put(orig);
1889 orig = xchg(&stab->bpf_verdict, NULL); 1917 orig = xchg(&progs->bpf_verdict, NULL);
1890 if (orig) 1918 if (orig)
1891 bpf_prog_put(orig); 1919 bpf_prog_put(orig);
1892 1920
1893 orig = xchg(&stab->bpf_tx_msg, NULL); 1921 orig = xchg(&progs->bpf_tx_msg, NULL);
1894 if (orig) 1922 if (orig)
1895 bpf_prog_put(orig); 1923 bpf_prog_put(orig);
1896} 1924}
diff --git a/net/core/filter.c b/net/core/filter.c
index ca60d2872da4..61a3ed6bac25 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2083,9 +2083,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
2083 if (unlikely(flags & ~(BPF_F_INGRESS))) 2083 if (unlikely(flags & ~(BPF_F_INGRESS)))
2084 return SK_DROP; 2084 return SK_DROP;
2085 2085
2086 tcb->bpf.key = key;
2087 tcb->bpf.flags = flags; 2086 tcb->bpf.flags = flags;
2088 tcb->bpf.map = map; 2087 tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
2088 if (!tcb->bpf.sk_redir)
2089 return SK_DROP;
2089 2090
2090 return SK_PASS; 2091 return SK_PASS;
2091} 2092}
@@ -2093,16 +2094,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
2093struct sock *do_sk_redirect_map(struct sk_buff *skb) 2094struct sock *do_sk_redirect_map(struct sk_buff *skb)
2094{ 2095{
2095 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); 2096 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2096 struct sock *sk = NULL;
2097
2098 if (tcb->bpf.map) {
2099 sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
2100 2097
2101 tcb->bpf.key = 0; 2098 return tcb->bpf.sk_redir;
2102 tcb->bpf.map = NULL;
2103 }
2104
2105 return sk;
2106} 2099}
2107 2100
2108static const struct bpf_func_proto bpf_sk_redirect_map_proto = { 2101static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
@@ -2122,25 +2115,17 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
2122 if (unlikely(flags & ~(BPF_F_INGRESS))) 2115 if (unlikely(flags & ~(BPF_F_INGRESS)))
2123 return SK_DROP; 2116 return SK_DROP;
2124 2117
2125 msg->key = key;
2126 msg->flags = flags; 2118 msg->flags = flags;
2127 msg->map = map; 2119 msg->sk_redir = __sock_map_lookup_elem(map, key);
2120 if (!msg->sk_redir)
2121 return SK_DROP;
2128 2122
2129 return SK_PASS; 2123 return SK_PASS;
2130} 2124}
2131 2125
2132struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) 2126struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
2133{ 2127{
2134 struct sock *sk = NULL; 2128 return msg->sk_redir;
2135
2136 if (msg->map) {
2137 sk = __sock_map_lookup_elem(msg->map, msg->key);
2138
2139 msg->key = 0;
2140 msg->map = NULL;
2141 }
2142
2143 return sk;
2144} 2129}
2145 2130
2146static const struct bpf_func_proto bpf_msg_redirect_map_proto = { 2131static const struct bpf_func_proto bpf_msg_redirect_map_proto = {