Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/arraymap.c          18
-rw-r--r--  kernel/bpf/cgroup.c            94
-rw-r--r--  kernel/bpf/core.c              46
-rw-r--r--  kernel/bpf/cpumap.c             9
-rw-r--r--  kernel/bpf/devmap.c            14
-rw-r--r--  kernel/bpf/hashtab.c           14
-rw-r--r--  kernel/bpf/local_storage.c     13
-rw-r--r--  kernel/bpf/lpm_trie.c           8
-rw-r--r--  kernel/bpf/queue_stack_maps.c  13
-rw-r--r--  kernel/bpf/reuseport_array.c   17
-rw-r--r--  kernel/bpf/stackmap.c          28
-rw-r--r--  kernel/bpf/syscall.c          103
-rw-r--r--  kernel/bpf/verifier.c         397
-rw-r--r--  kernel/bpf/xskmap.c            10
-rw-r--r--  kernel/cgroup/cgroup.c         11
-rw-r--r--  kernel/trace/bpf_trace.c       96
16 files changed, 661 insertions, 230 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 584636c9e2eb..0349cbf23cdb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -83,6 +83,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
83 u32 elem_size, index_mask, max_entries; 83 u32 elem_size, index_mask, max_entries;
84 bool unpriv = !capable(CAP_SYS_ADMIN); 84 bool unpriv = !capable(CAP_SYS_ADMIN);
85 u64 cost, array_size, mask64; 85 u64 cost, array_size, mask64;
86 struct bpf_map_memory mem;
86 struct bpf_array *array; 87 struct bpf_array *array;
87 88
88 elem_size = round_up(attr->value_size, 8); 89 elem_size = round_up(attr->value_size, 8);
@@ -116,32 +117,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
116 117
117 /* make sure there is no u32 overflow later in round_up() */ 118 /* make sure there is no u32 overflow later in round_up() */
118 cost = array_size; 119 cost = array_size;
119 if (cost >= U32_MAX - PAGE_SIZE) 120 if (percpu)
120 return ERR_PTR(-ENOMEM);
121 if (percpu) {
122 cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); 121 cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
123 if (cost >= U32_MAX - PAGE_SIZE)
124 return ERR_PTR(-ENOMEM);
125 }
126 cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
127 122
128 ret = bpf_map_precharge_memlock(cost); 123 ret = bpf_map_charge_init(&mem, cost);
129 if (ret < 0) 124 if (ret < 0)
130 return ERR_PTR(ret); 125 return ERR_PTR(ret);
131 126
132 /* allocate all map elements and zero-initialize them */ 127 /* allocate all map elements and zero-initialize them */
133 array = bpf_map_area_alloc(array_size, numa_node); 128 array = bpf_map_area_alloc(array_size, numa_node);
134 if (!array) 129 if (!array) {
130 bpf_map_charge_finish(&mem);
135 return ERR_PTR(-ENOMEM); 131 return ERR_PTR(-ENOMEM);
132 }
136 array->index_mask = index_mask; 133 array->index_mask = index_mask;
137 array->map.unpriv_array = unpriv; 134 array->map.unpriv_array = unpriv;
138 135
139 /* copy mandatory map attributes */ 136 /* copy mandatory map attributes */
140 bpf_map_init_from_attr(&array->map, attr); 137 bpf_map_init_from_attr(&array->map, attr);
141 array->map.pages = cost; 138 bpf_map_charge_move(&array->map.memory, &mem);
142 array->elem_size = elem_size; 139 array->elem_size = elem_size;
143 140
144 if (percpu && bpf_array_alloc_percpu(array)) { 141 if (percpu && bpf_array_alloc_percpu(array)) {
142 bpf_map_charge_finish(&array->map.memory);
145 bpf_map_area_free(array); 143 bpf_map_area_free(array);
146 return ERR_PTR(-ENOMEM); 144 return ERR_PTR(-ENOMEM);
147 } 145 }
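
Illustration (not part of the diff): the arraymap conversion above shows the accounting flow that the converted map types follow: charge first, allocate, then either release the charge on failure or move it into the map on success. A minimal sketch of that pattern using the helpers introduced by this series; struct example_map and its sizing are hypothetical:

static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	struct bpf_map_memory mem;
	struct example_map *m;
	u64 cost;
	int ret;

	cost = sizeof(*m) + (u64)attr->max_entries * attr->value_size;

	/* charge the full byte size up front; bpf_map_charge_init() rounds
	 * to pages and rejects sizes of U32_MAX - PAGE_SIZE or more
	 */
	ret = bpf_map_charge_init(&mem, cost);
	if (ret < 0)
		return ERR_PTR(ret);

	m = bpf_map_area_alloc(sizeof(*m), bpf_map_attr_numa_node(attr));
	if (!m) {
		bpf_map_charge_finish(&mem);	/* undo the charge on failure */
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&m->map, attr);
	/* hand ownership of the charge to the map; the source is zeroed */
	bpf_map_charge_move(&m->map.memory, &mem);

	return &m->map;
}
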
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index fcde0f7b2585..1b65ab0df457 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -22,13 +22,23 @@
22DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); 22DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
23EXPORT_SYMBOL(cgroup_bpf_enabled_key); 23EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 24
25void cgroup_bpf_offline(struct cgroup *cgrp)
26{
27 cgroup_get(cgrp);
28 percpu_ref_kill(&cgrp->bpf.refcnt);
29}
30
25/** 31/**
26 * cgroup_bpf_put() - put references of all bpf programs 32 * cgroup_bpf_release() - put references of all bpf programs and
27 * @cgrp: the cgroup to modify 33 * release all cgroup bpf data
34 * @work: work structure embedded into the cgroup to modify
28 */ 35 */
29void cgroup_bpf_put(struct cgroup *cgrp) 36static void cgroup_bpf_release(struct work_struct *work)
30{ 37{
38 struct cgroup *cgrp = container_of(work, struct cgroup,
39 bpf.release_work);
31 enum bpf_cgroup_storage_type stype; 40 enum bpf_cgroup_storage_type stype;
41 struct bpf_prog_array *old_array;
32 unsigned int type; 42 unsigned int type;
33 43
34 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { 44 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
@@ -45,8 +55,27 @@ void cgroup_bpf_put(struct cgroup *cgrp)
45 kfree(pl); 55 kfree(pl);
46 static_branch_dec(&cgroup_bpf_enabled_key); 56 static_branch_dec(&cgroup_bpf_enabled_key);
47 } 57 }
48 bpf_prog_array_free(cgrp->bpf.effective[type]); 58 old_array = rcu_dereference_protected(
59 cgrp->bpf.effective[type],
60 percpu_ref_is_dying(&cgrp->bpf.refcnt));
61 bpf_prog_array_free(old_array);
49 } 62 }
63
64 percpu_ref_exit(&cgrp->bpf.refcnt);
65 cgroup_put(cgrp);
66}
67
68/**
69 * cgroup_bpf_release_fn() - callback used to schedule releasing
70 * of bpf cgroup data
71 * @ref: percpu ref counter structure
72 */
73static void cgroup_bpf_release_fn(struct percpu_ref *ref)
74{
75 struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
76
77 INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
78 queue_work(system_wq, &cgrp->bpf.release_work);
50} 79}
51 80
52/* count number of elements in the list. 81/* count number of elements in the list.
@@ -101,7 +130,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
101 */ 130 */
102static int compute_effective_progs(struct cgroup *cgrp, 131static int compute_effective_progs(struct cgroup *cgrp,
103 enum bpf_attach_type type, 132 enum bpf_attach_type type,
104 struct bpf_prog_array __rcu **array) 133 struct bpf_prog_array **array)
105{ 134{
106 enum bpf_cgroup_storage_type stype; 135 enum bpf_cgroup_storage_type stype;
107 struct bpf_prog_array *progs; 136 struct bpf_prog_array *progs;
@@ -139,17 +168,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
139 } 168 }
140 } while ((p = cgroup_parent(p))); 169 } while ((p = cgroup_parent(p)));
141 170
142 rcu_assign_pointer(*array, progs); 171 *array = progs;
143 return 0; 172 return 0;
144} 173}
145 174
146static void activate_effective_progs(struct cgroup *cgrp, 175static void activate_effective_progs(struct cgroup *cgrp,
147 enum bpf_attach_type type, 176 enum bpf_attach_type type,
148 struct bpf_prog_array __rcu *array) 177 struct bpf_prog_array *old_array)
149{ 178{
150 struct bpf_prog_array __rcu *old_array; 179 rcu_swap_protected(cgrp->bpf.effective[type], old_array,
151 180 lockdep_is_held(&cgroup_mutex));
152 old_array = xchg(&cgrp->bpf.effective[type], array);
153 /* free prog array after grace period, since __cgroup_bpf_run_*() 181 /* free prog array after grace period, since __cgroup_bpf_run_*()
154 * might be still walking the array 182 * might be still walking the array
155 */ 183 */
@@ -166,8 +194,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
166 * that array below is variable length 194 * that array below is variable length
167 */ 195 */
168#define NR ARRAY_SIZE(cgrp->bpf.effective) 196#define NR ARRAY_SIZE(cgrp->bpf.effective)
169 struct bpf_prog_array __rcu *arrays[NR] = {}; 197 struct bpf_prog_array *arrays[NR] = {};
170 int i; 198 int ret, i;
199
200 ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
201 GFP_KERNEL);
202 if (ret)
203 return ret;
171 204
172 for (i = 0; i < NR; i++) 205 for (i = 0; i < NR; i++)
173 INIT_LIST_HEAD(&cgrp->bpf.progs[i]); 206 INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -183,6 +216,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
183cleanup: 216cleanup:
184 for (i = 0; i < NR; i++) 217 for (i = 0; i < NR; i++)
185 bpf_prog_array_free(arrays[i]); 218 bpf_prog_array_free(arrays[i]);
219
220 percpu_ref_exit(&cgrp->bpf.refcnt);
221
186 return -ENOMEM; 222 return -ENOMEM;
187} 223}
188 224
@@ -444,10 +480,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
444 enum bpf_attach_type type = attr->query.attach_type; 480 enum bpf_attach_type type = attr->query.attach_type;
445 struct list_head *progs = &cgrp->bpf.progs[type]; 481 struct list_head *progs = &cgrp->bpf.progs[type];
446 u32 flags = cgrp->bpf.flags[type]; 482 u32 flags = cgrp->bpf.flags[type];
483 struct bpf_prog_array *effective;
447 int cnt, ret = 0, i; 484 int cnt, ret = 0, i;
448 485
486 effective = rcu_dereference_protected(cgrp->bpf.effective[type],
487 lockdep_is_held(&cgroup_mutex));
488
449 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) 489 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
450 cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); 490 cnt = bpf_prog_array_length(effective);
451 else 491 else
452 cnt = prog_list_length(progs); 492 cnt = prog_list_length(progs);
453 493
@@ -464,8 +504,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
464 } 504 }
465 505
466 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { 506 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
467 return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], 507 return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
468 prog_ids, cnt);
469 } else { 508 } else {
470 struct bpf_prog_list *pl; 509 struct bpf_prog_list *pl;
471 u32 id; 510 u32 id;
@@ -548,8 +587,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
548 * The program type passed in via @type must be suitable for network 587 * The program type passed in via @type must be suitable for network
549 * filtering. No further check is performed to assert that. 588 * filtering. No further check is performed to assert that.
550 * 589 *
551 * This function will return %-EPERM if any if an attached program was found 590 * For egress packets, this function can return:
552 * and if it returned != 1 during execution. In all other cases, 0 is returned. 591 * NET_XMIT_SUCCESS (0) - continue with packet output
592 * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
593 * NET_XMIT_CN (2) - continue with packet output and notify TCP
594 * to call cwr
595 * -EPERM - drop packet
596 *
597 * For ingress packets, this function will return -EPERM if any
598 * attached program was found and if it returned != 1 during execution.
599 * Otherwise 0 is returned.
553 */ 600 */
554int __cgroup_bpf_run_filter_skb(struct sock *sk, 601int __cgroup_bpf_run_filter_skb(struct sock *sk,
555 struct sk_buff *skb, 602 struct sk_buff *skb,
@@ -575,12 +622,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
575 /* compute pointers for the bpf prog */ 622 /* compute pointers for the bpf prog */
576 bpf_compute_and_save_data_end(skb, &saved_data_end); 623 bpf_compute_and_save_data_end(skb, &saved_data_end);
577 624
578 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 625 if (type == BPF_CGROUP_INET_EGRESS) {
579 __bpf_prog_run_save_cb); 626 ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
627 cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
628 } else {
629 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
630 __bpf_prog_run_save_cb);
631 ret = (ret == 1 ? 0 : -EPERM);
632 }
580 bpf_restore_data_end(skb, saved_data_end); 633 bpf_restore_data_end(skb, saved_data_end);
581 __skb_pull(skb, offset); 634 __skb_pull(skb, offset);
582 skb->sk = save_sk; 635 skb->sk = save_sk;
583 return ret == 1 ? 0 : -EPERM; 636
637 return ret;
584} 638}
585EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 639EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
586 640
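
Illustration (not part of the diff): with the expanded return contract above, a cgroup_skb program attached at egress may return values in the 0..3 range that check_return_code() now permits (see the verifier hunk below). This sketch assumes the usual libbpf-style section naming and reads values 2 and 3 as the "notify TCP to call cwr" variants implied by the comment; the exact header path and the precise 2 vs. 3 mapping are assumptions, not taken from this diff.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>	/* for SEC(); path depends on the libbpf version */

SEC("cgroup_skb/egress")
int egress_notify_cn(struct __sk_buff *skb)
{
	/* 0 and 1 keep their old drop/allow meaning; 2 and 3 additionally
	 * request congestion feedback (NET_XMIT_DROP / NET_XMIT_CN).
	 */
	if (skb->len > 1400)
		return 3;	/* let the packet out, but tell TCP to cwr */

	return 1;		/* plain allow */
}

char _license[] SEC("license") = "GPL";
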
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 242a643af82f..33fb292f2e30 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1795,38 +1795,33 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
1795 return &empty_prog_array.hdr; 1795 return &empty_prog_array.hdr;
1796} 1796}
1797 1797
1798void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) 1798void bpf_prog_array_free(struct bpf_prog_array *progs)
1799{ 1799{
1800 if (!progs || 1800 if (!progs || progs == &empty_prog_array.hdr)
1801 progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
1802 return; 1801 return;
1803 kfree_rcu(progs, rcu); 1802 kfree_rcu(progs, rcu);
1804} 1803}
1805 1804
1806int bpf_prog_array_length(struct bpf_prog_array __rcu *array) 1805int bpf_prog_array_length(struct bpf_prog_array *array)
1807{ 1806{
1808 struct bpf_prog_array_item *item; 1807 struct bpf_prog_array_item *item;
1809 u32 cnt = 0; 1808 u32 cnt = 0;
1810 1809
1811 rcu_read_lock(); 1810 for (item = array->items; item->prog; item++)
1812 item = rcu_dereference(array)->items;
1813 for (; item->prog; item++)
1814 if (item->prog != &dummy_bpf_prog.prog) 1811 if (item->prog != &dummy_bpf_prog.prog)
1815 cnt++; 1812 cnt++;
1816 rcu_read_unlock();
1817 return cnt; 1813 return cnt;
1818} 1814}
1819 1815
1820 1816
1821static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, 1817static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
1822 u32 *prog_ids, 1818 u32 *prog_ids,
1823 u32 request_cnt) 1819 u32 request_cnt)
1824{ 1820{
1825 struct bpf_prog_array_item *item; 1821 struct bpf_prog_array_item *item;
1826 int i = 0; 1822 int i = 0;
1827 1823
1828 item = rcu_dereference_check(array, 1)->items; 1824 for (item = array->items; item->prog; item++) {
1829 for (; item->prog; item++) {
1830 if (item->prog == &dummy_bpf_prog.prog) 1825 if (item->prog == &dummy_bpf_prog.prog)
1831 continue; 1826 continue;
1832 prog_ids[i] = item->prog->aux->id; 1827 prog_ids[i] = item->prog->aux->id;
@@ -1839,7 +1834,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
1839 return !!(item->prog); 1834 return !!(item->prog);
1840} 1835}
1841 1836
1842int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, 1837int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
1843 __u32 __user *prog_ids, u32 cnt) 1838 __u32 __user *prog_ids, u32 cnt)
1844{ 1839{
1845 unsigned long err = 0; 1840 unsigned long err = 0;
@@ -1850,18 +1845,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
1850 * cnt = bpf_prog_array_length(); 1845 * cnt = bpf_prog_array_length();
1851 * if (cnt > 0) 1846 * if (cnt > 0)
1852 * bpf_prog_array_copy_to_user(..., cnt); 1847 * bpf_prog_array_copy_to_user(..., cnt);
1853 * so below kcalloc doesn't need extra cnt > 0 check, but 1848 * so below kcalloc doesn't need extra cnt > 0 check.
1854 * bpf_prog_array_length() releases rcu lock and
1855 * prog array could have been swapped with empty or larger array,
1856 * so always copy 'cnt' prog_ids to the user.
1857 * In a rare race the user will see zero prog_ids
1858 */ 1849 */
1859 ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); 1850 ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
1860 if (!ids) 1851 if (!ids)
1861 return -ENOMEM; 1852 return -ENOMEM;
1862 rcu_read_lock();
1863 nospc = bpf_prog_array_copy_core(array, ids, cnt); 1853 nospc = bpf_prog_array_copy_core(array, ids, cnt);
1864 rcu_read_unlock();
1865 err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); 1854 err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
1866 kfree(ids); 1855 kfree(ids);
1867 if (err) 1856 if (err)
@@ -1871,19 +1860,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
1871 return 0; 1860 return 0;
1872} 1861}
1873 1862
1874void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, 1863void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
1875 struct bpf_prog *old_prog) 1864 struct bpf_prog *old_prog)
1876{ 1865{
1877 struct bpf_prog_array_item *item = array->items; 1866 struct bpf_prog_array_item *item;
1878 1867
1879 for (; item->prog; item++) 1868 for (item = array->items; item->prog; item++)
1880 if (item->prog == old_prog) { 1869 if (item->prog == old_prog) {
1881 WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); 1870 WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
1882 break; 1871 break;
1883 } 1872 }
1884} 1873}
1885 1874
1886int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, 1875int bpf_prog_array_copy(struct bpf_prog_array *old_array,
1887 struct bpf_prog *exclude_prog, 1876 struct bpf_prog *exclude_prog,
1888 struct bpf_prog *include_prog, 1877 struct bpf_prog *include_prog,
1889 struct bpf_prog_array **new_array) 1878 struct bpf_prog_array **new_array)
@@ -1947,7 +1936,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
1947 return 0; 1936 return 0;
1948} 1937}
1949 1938
1950int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, 1939int bpf_prog_array_copy_info(struct bpf_prog_array *array,
1951 u32 *prog_ids, u32 request_cnt, 1940 u32 *prog_ids, u32 request_cnt,
1952 u32 *prog_cnt) 1941 u32 *prog_cnt)
1953{ 1942{
@@ -2090,6 +2079,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
2090 return false; 2079 return false;
2091} 2080}
2092 2081
2082/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
2083 * analysis code and wants explicit zero extension inserted by verifier.
2084 * Otherwise, return FALSE.
2085 */
2086bool __weak bpf_jit_needs_zext(void)
2087{
2088 return false;
2089}
2090
2093/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call 2091/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
2094 * skb_copy_bits(), so provide a weak definition of it for NET-less config. 2092 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
2095 */ 2093 */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index cf727d77c6c6..b31a71909307 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -106,12 +106,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
106 /* make sure page count doesn't overflow */ 106 /* make sure page count doesn't overflow */
107 cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); 107 cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
108 cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); 108 cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
109 if (cost >= U32_MAX - PAGE_SIZE)
110 goto free_cmap;
111 cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
112 109
113 /* Notice returns -EPERM on if map size is larger than memlock limit */ 110 /* Notice returns -EPERM on if map size is larger than memlock limit */
114 ret = bpf_map_precharge_memlock(cmap->map.pages); 111 ret = bpf_map_charge_init(&cmap->map.memory, cost);
115 if (ret) { 112 if (ret) {
116 err = ret; 113 err = ret;
117 goto free_cmap; 114 goto free_cmap;
@@ -121,7 +118,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
121 cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), 118 cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
122 __alignof__(unsigned long)); 119 __alignof__(unsigned long));
123 if (!cmap->flush_needed) 120 if (!cmap->flush_needed)
124 goto free_cmap; 121 goto free_charge;
125 122
126 /* Alloc array for possible remote "destination" CPUs */ 123 /* Alloc array for possible remote "destination" CPUs */
127 cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * 124 cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
@@ -133,6 +130,8 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
133 return &cmap->map; 130 return &cmap->map;
134free_percpu: 131free_percpu:
135 free_percpu(cmap->flush_needed); 132 free_percpu(cmap->flush_needed);
133free_charge:
134 bpf_map_charge_finish(&cmap->map.memory);
136free_cmap: 135free_cmap:
137 kfree(cmap); 136 kfree(cmap);
138 return ERR_PTR(err); 137 return ERR_PTR(err);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 1e525d70f833..5ae7cce5ef16 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -108,13 +108,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
108 /* make sure page count doesn't overflow */ 108 /* make sure page count doesn't overflow */
109 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); 109 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
110 cost += dev_map_bitmap_size(attr) * num_possible_cpus(); 110 cost += dev_map_bitmap_size(attr) * num_possible_cpus();
111 if (cost >= U32_MAX - PAGE_SIZE)
112 goto free_dtab;
113
114 dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
115 111
116 /* if map size is larger than memlock limit, reject it early */ 112 /* if map size is larger than memlock limit, reject it */
117 err = bpf_map_precharge_memlock(dtab->map.pages); 113 err = bpf_map_charge_init(&dtab->map.memory, cost);
118 if (err) 114 if (err)
119 goto free_dtab; 115 goto free_dtab;
120 116
@@ -125,19 +121,21 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
125 __alignof__(unsigned long), 121 __alignof__(unsigned long),
126 GFP_KERNEL | __GFP_NOWARN); 122 GFP_KERNEL | __GFP_NOWARN);
127 if (!dtab->flush_needed) 123 if (!dtab->flush_needed)
128 goto free_dtab; 124 goto free_charge;
129 125
130 dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * 126 dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
131 sizeof(struct bpf_dtab_netdev *), 127 sizeof(struct bpf_dtab_netdev *),
132 dtab->map.numa_node); 128 dtab->map.numa_node);
133 if (!dtab->netdev_map) 129 if (!dtab->netdev_map)
134 goto free_dtab; 130 goto free_charge;
135 131
136 spin_lock(&dev_map_lock); 132 spin_lock(&dev_map_lock);
137 list_add_tail_rcu(&dtab->list, &dev_map_list); 133 list_add_tail_rcu(&dtab->list, &dev_map_list);
138 spin_unlock(&dev_map_lock); 134 spin_unlock(&dev_map_lock);
139 135
140 return &dtab->map; 136 return &dtab->map;
137free_charge:
138 bpf_map_charge_finish(&dtab->map.memory);
141free_dtab: 139free_dtab:
142 free_percpu(dtab->flush_needed); 140 free_percpu(dtab->flush_needed);
143 kfree(dtab); 141 kfree(dtab);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 0f2708fde5f7..d92e05d9979b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -360,14 +360,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
360 else 360 else
361 cost += (u64) htab->elem_size * num_possible_cpus(); 361 cost += (u64) htab->elem_size * num_possible_cpus();
362 362
363 if (cost >= U32_MAX - PAGE_SIZE) 363 /* if map size is larger than memlock limit, reject it */
364 /* make sure page count doesn't overflow */ 364 err = bpf_map_charge_init(&htab->map.memory, cost);
365 goto free_htab;
366
367 htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
368
369 /* if map size is larger than memlock limit, reject it early */
370 err = bpf_map_precharge_memlock(htab->map.pages);
371 if (err) 365 if (err)
372 goto free_htab; 366 goto free_htab;
373 367
@@ -376,7 +370,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
376 sizeof(struct bucket), 370 sizeof(struct bucket),
377 htab->map.numa_node); 371 htab->map.numa_node);
378 if (!htab->buckets) 372 if (!htab->buckets)
379 goto free_htab; 373 goto free_charge;
380 374
381 if (htab->map.map_flags & BPF_F_ZERO_SEED) 375 if (htab->map.map_flags & BPF_F_ZERO_SEED)
382 htab->hashrnd = 0; 376 htab->hashrnd = 0;
@@ -409,6 +403,8 @@ free_prealloc:
409 prealloc_destroy(htab); 403 prealloc_destroy(htab);
410free_buckets: 404free_buckets:
411 bpf_map_area_free(htab->buckets); 405 bpf_map_area_free(htab->buckets);
406free_charge:
407 bpf_map_charge_finish(&htab->map.memory);
412free_htab: 408free_htab:
413 kfree(htab); 409 kfree(htab);
414 return ERR_PTR(err); 410 return ERR_PTR(err);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 980e8f1f6cb5..addd6fdceec8 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -272,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
272{ 272{
273 int numa_node = bpf_map_attr_numa_node(attr); 273 int numa_node = bpf_map_attr_numa_node(attr);
274 struct bpf_cgroup_storage_map *map; 274 struct bpf_cgroup_storage_map *map;
275 struct bpf_map_memory mem;
276 int ret;
275 277
276 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) 278 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
277 return ERR_PTR(-EINVAL); 279 return ERR_PTR(-EINVAL);
@@ -290,13 +292,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
290 /* max_entries is not used and enforced to be 0 */ 292 /* max_entries is not used and enforced to be 0 */
291 return ERR_PTR(-EINVAL); 293 return ERR_PTR(-EINVAL);
292 294
295 ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
296 if (ret < 0)
297 return ERR_PTR(ret);
298
293 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), 299 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
294 __GFP_ZERO | GFP_USER, numa_node); 300 __GFP_ZERO | GFP_USER, numa_node);
295 if (!map) 301 if (!map) {
302 bpf_map_charge_finish(&mem);
296 return ERR_PTR(-ENOMEM); 303 return ERR_PTR(-ENOMEM);
304 }
297 305
298 map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), 306 bpf_map_charge_move(&map->map.memory, &mem);
299 PAGE_SIZE) >> PAGE_SHIFT;
300 307
301 /* copy mandatory map attributes */ 308 /* copy mandatory map attributes */
302 bpf_map_init_from_attr(&map->map, attr); 309 bpf_map_init_from_attr(&map->map, attr);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e61630c2e50b..09334f13a8a0 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -573,14 +573,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
573 cost_per_node = sizeof(struct lpm_trie_node) + 573 cost_per_node = sizeof(struct lpm_trie_node) +
574 attr->value_size + trie->data_size; 574 attr->value_size + trie->data_size;
575 cost += (u64) attr->max_entries * cost_per_node; 575 cost += (u64) attr->max_entries * cost_per_node;
576 if (cost >= U32_MAX - PAGE_SIZE) {
577 ret = -E2BIG;
578 goto out_err;
579 }
580
581 trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
582 576
583 ret = bpf_map_precharge_memlock(trie->map.pages); 577 ret = bpf_map_charge_init(&trie->map.memory, cost);
584 if (ret) 578 if (ret)
585 goto out_err; 579 goto out_err;
586 580
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 0b140d236889..f697647ceb54 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -67,29 +67,28 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
67static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) 67static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
68{ 68{
69 int ret, numa_node = bpf_map_attr_numa_node(attr); 69 int ret, numa_node = bpf_map_attr_numa_node(attr);
70 struct bpf_map_memory mem = {0};
70 struct bpf_queue_stack *qs; 71 struct bpf_queue_stack *qs;
71 u64 size, queue_size, cost; 72 u64 size, queue_size, cost;
72 73
73 size = (u64) attr->max_entries + 1; 74 size = (u64) attr->max_entries + 1;
74 cost = queue_size = sizeof(*qs) + size * attr->value_size; 75 cost = queue_size = sizeof(*qs) + size * attr->value_size;
75 if (cost >= U32_MAX - PAGE_SIZE)
76 return ERR_PTR(-E2BIG);
77 76
78 cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 77 ret = bpf_map_charge_init(&mem, cost);
79
80 ret = bpf_map_precharge_memlock(cost);
81 if (ret < 0) 78 if (ret < 0)
82 return ERR_PTR(ret); 79 return ERR_PTR(ret);
83 80
84 qs = bpf_map_area_alloc(queue_size, numa_node); 81 qs = bpf_map_area_alloc(queue_size, numa_node);
85 if (!qs) 82 if (!qs) {
83 bpf_map_charge_finish(&mem);
86 return ERR_PTR(-ENOMEM); 84 return ERR_PTR(-ENOMEM);
85 }
87 86
88 memset(qs, 0, sizeof(*qs)); 87 memset(qs, 0, sizeof(*qs));
89 88
90 bpf_map_init_from_attr(&qs->map, attr); 89 bpf_map_init_from_attr(&qs->map, attr);
91 90
92 qs->map.pages = cost; 91 bpf_map_charge_move(&qs->map.memory, &mem);
93 qs->size = size; 92 qs->size = size;
94 93
95 raw_spin_lock_init(&qs->lock); 94 raw_spin_lock_init(&qs->lock);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..50c083ba978c 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -151,7 +151,8 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
151{ 151{
152 int err, numa_node = bpf_map_attr_numa_node(attr); 152 int err, numa_node = bpf_map_attr_numa_node(attr);
153 struct reuseport_array *array; 153 struct reuseport_array *array;
154 u64 cost, array_size; 154 struct bpf_map_memory mem;
155 u64 array_size;
155 156
156 if (!capable(CAP_SYS_ADMIN)) 157 if (!capable(CAP_SYS_ADMIN))
157 return ERR_PTR(-EPERM); 158 return ERR_PTR(-EPERM);
@@ -159,24 +160,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
159 array_size = sizeof(*array); 160 array_size = sizeof(*array);
160 array_size += (u64)attr->max_entries * sizeof(struct sock *); 161 array_size += (u64)attr->max_entries * sizeof(struct sock *);
161 162
162 /* make sure there is no u32 overflow later in round_up() */ 163 err = bpf_map_charge_init(&mem, array_size);
163 cost = array_size;
164 if (cost >= U32_MAX - PAGE_SIZE)
165 return ERR_PTR(-ENOMEM);
166 cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
167
168 err = bpf_map_precharge_memlock(cost);
169 if (err) 164 if (err)
170 return ERR_PTR(err); 165 return ERR_PTR(err);
171 166
172 /* allocate all map elements and zero-initialize them */ 167 /* allocate all map elements and zero-initialize them */
173 array = bpf_map_area_alloc(array_size, numa_node); 168 array = bpf_map_area_alloc(array_size, numa_node);
174 if (!array) 169 if (!array) {
170 bpf_map_charge_finish(&mem);
175 return ERR_PTR(-ENOMEM); 171 return ERR_PTR(-ENOMEM);
172 }
176 173
177 /* copy mandatory map attributes */ 174 /* copy mandatory map attributes */
178 bpf_map_init_from_attr(&array->map, attr); 175 bpf_map_init_from_attr(&array->map, attr);
179 array->map.pages = cost; 176 bpf_map_charge_move(&array->map.memory, &mem);
180 177
181 return &array->map; 178 return &array->map;
182} 179}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 950ab2f28922..3d86072d8e32 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -89,6 +89,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
89{ 89{
90 u32 value_size = attr->value_size; 90 u32 value_size = attr->value_size;
91 struct bpf_stack_map *smap; 91 struct bpf_stack_map *smap;
92 struct bpf_map_memory mem;
92 u64 cost, n_buckets; 93 u64 cost, n_buckets;
93 int err; 94 int err;
94 95
@@ -116,40 +117,37 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
116 n_buckets = roundup_pow_of_two(attr->max_entries); 117 n_buckets = roundup_pow_of_two(attr->max_entries);
117 118
118 cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); 119 cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
119 if (cost >= U32_MAX - PAGE_SIZE) 120 cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
120 return ERR_PTR(-E2BIG); 121 err = bpf_map_charge_init(&mem, cost);
122 if (err)
123 return ERR_PTR(err);
121 124
122 smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); 125 smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
123 if (!smap) 126 if (!smap) {
127 bpf_map_charge_finish(&mem);
124 return ERR_PTR(-ENOMEM); 128 return ERR_PTR(-ENOMEM);
125 129 }
126 err = -E2BIG;
127 cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
128 if (cost >= U32_MAX - PAGE_SIZE)
129 goto free_smap;
130 130
131 bpf_map_init_from_attr(&smap->map, attr); 131 bpf_map_init_from_attr(&smap->map, attr);
132 smap->map.value_size = value_size; 132 smap->map.value_size = value_size;
133 smap->n_buckets = n_buckets; 133 smap->n_buckets = n_buckets;
134 smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
135
136 err = bpf_map_precharge_memlock(smap->map.pages);
137 if (err)
138 goto free_smap;
139 134
140 err = get_callchain_buffers(sysctl_perf_event_max_stack); 135 err = get_callchain_buffers(sysctl_perf_event_max_stack);
141 if (err) 136 if (err)
142 goto free_smap; 137 goto free_charge;
143 138
144 err = prealloc_elems_and_freelist(smap); 139 err = prealloc_elems_and_freelist(smap);
145 if (err) 140 if (err)
146 goto put_buffers; 141 goto put_buffers;
147 142
143 bpf_map_charge_move(&smap->map.memory, &mem);
144
148 return &smap->map; 145 return &smap->map;
149 146
150put_buffers: 147put_buffers:
151 put_callchain_buffers(); 148 put_callchain_buffers();
152free_smap: 149free_charge:
150 bpf_map_charge_finish(&mem);
153 bpf_map_area_free(smap); 151 bpf_map_area_free(smap);
154 return ERR_PTR(err); 152 return ERR_PTR(err);
155} 153}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb5440b02e82..4c53cbd3329d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -188,19 +188,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
188 map->numa_node = bpf_map_attr_numa_node(attr); 188 map->numa_node = bpf_map_attr_numa_node(attr);
189} 189}
190 190
191int bpf_map_precharge_memlock(u32 pages)
192{
193 struct user_struct *user = get_current_user();
194 unsigned long memlock_limit, cur;
195
196 memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
197 cur = atomic_long_read(&user->locked_vm);
198 free_uid(user);
199 if (cur + pages > memlock_limit)
200 return -EPERM;
201 return 0;
202}
203
204static int bpf_charge_memlock(struct user_struct *user, u32 pages) 191static int bpf_charge_memlock(struct user_struct *user, u32 pages)
205{ 192{
206 unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 193 unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -214,45 +201,62 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages)
214 201
215static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) 202static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
216{ 203{
217 atomic_long_sub(pages, &user->locked_vm); 204 if (user)
205 atomic_long_sub(pages, &user->locked_vm);
218} 206}
219 207
220static int bpf_map_init_memlock(struct bpf_map *map) 208int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
221{ 209{
222 struct user_struct *user = get_current_user(); 210 u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
211 struct user_struct *user;
223 int ret; 212 int ret;
224 213
225 ret = bpf_charge_memlock(user, map->pages); 214 if (size >= U32_MAX - PAGE_SIZE)
215 return -E2BIG;
216
217 user = get_current_user();
218 ret = bpf_charge_memlock(user, pages);
226 if (ret) { 219 if (ret) {
227 free_uid(user); 220 free_uid(user);
228 return ret; 221 return ret;
229 } 222 }
230 map->user = user; 223
231 return ret; 224 mem->pages = pages;
225 mem->user = user;
226
227 return 0;
232} 228}
233 229
234static void bpf_map_release_memlock(struct bpf_map *map) 230void bpf_map_charge_finish(struct bpf_map_memory *mem)
235{ 231{
236 struct user_struct *user = map->user; 232 bpf_uncharge_memlock(mem->user, mem->pages);
237 bpf_uncharge_memlock(user, map->pages); 233 free_uid(mem->user);
238 free_uid(user); 234}
235
236void bpf_map_charge_move(struct bpf_map_memory *dst,
237 struct bpf_map_memory *src)
238{
239 *dst = *src;
240
241 /* Make sure src will not be used for the redundant uncharging. */
242 memset(src, 0, sizeof(struct bpf_map_memory));
239} 243}
240 244
241int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) 245int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
242{ 246{
243 int ret; 247 int ret;
244 248
245 ret = bpf_charge_memlock(map->user, pages); 249 ret = bpf_charge_memlock(map->memory.user, pages);
246 if (ret) 250 if (ret)
247 return ret; 251 return ret;
248 map->pages += pages; 252 map->memory.pages += pages;
249 return ret; 253 return ret;
250} 254}
251 255
252void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) 256void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
253{ 257{
254 bpf_uncharge_memlock(map->user, pages); 258 bpf_uncharge_memlock(map->memory.user, pages);
255 map->pages -= pages; 259 map->memory.pages -= pages;
256} 260}
257 261
258static int bpf_map_alloc_id(struct bpf_map *map) 262static int bpf_map_alloc_id(struct bpf_map *map)
@@ -303,11 +307,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
303static void bpf_map_free_deferred(struct work_struct *work) 307static void bpf_map_free_deferred(struct work_struct *work)
304{ 308{
305 struct bpf_map *map = container_of(work, struct bpf_map, work); 309 struct bpf_map *map = container_of(work, struct bpf_map, work);
310 struct bpf_map_memory mem;
306 311
307 bpf_map_release_memlock(map); 312 bpf_map_charge_move(&mem, &map->memory);
308 security_bpf_map_free(map); 313 security_bpf_map_free(map);
309 /* implementation dependent freeing */ 314 /* implementation dependent freeing */
310 map->ops->map_free(map); 315 map->ops->map_free(map);
316 bpf_map_charge_finish(&mem);
311} 317}
312 318
313static void bpf_map_put_uref(struct bpf_map *map) 319static void bpf_map_put_uref(struct bpf_map *map)
@@ -395,7 +401,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
395 map->value_size, 401 map->value_size,
396 map->max_entries, 402 map->max_entries,
397 map->map_flags, 403 map->map_flags,
398 map->pages * 1ULL << PAGE_SHIFT, 404 map->memory.pages * 1ULL << PAGE_SHIFT,
399 map->id, 405 map->id,
400 READ_ONCE(map->frozen)); 406 READ_ONCE(map->frozen));
401 407
@@ -549,6 +555,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
549static int map_create(union bpf_attr *attr) 555static int map_create(union bpf_attr *attr)
550{ 556{
551 int numa_node = bpf_map_attr_numa_node(attr); 557 int numa_node = bpf_map_attr_numa_node(attr);
558 struct bpf_map_memory mem;
552 struct bpf_map *map; 559 struct bpf_map *map;
553 int f_flags; 560 int f_flags;
554 int err; 561 int err;
@@ -573,7 +580,7 @@ static int map_create(union bpf_attr *attr)
573 580
574 err = bpf_obj_name_cpy(map->name, attr->map_name); 581 err = bpf_obj_name_cpy(map->name, attr->map_name);
575 if (err) 582 if (err)
576 goto free_map_nouncharge; 583 goto free_map;
577 584
578 atomic_set(&map->refcnt, 1); 585 atomic_set(&map->refcnt, 1);
579 atomic_set(&map->usercnt, 1); 586 atomic_set(&map->usercnt, 1);
@@ -583,20 +590,20 @@ static int map_create(union bpf_attr *attr)
583 590
584 if (!attr->btf_value_type_id) { 591 if (!attr->btf_value_type_id) {
585 err = -EINVAL; 592 err = -EINVAL;
586 goto free_map_nouncharge; 593 goto free_map;
587 } 594 }
588 595
589 btf = btf_get_by_fd(attr->btf_fd); 596 btf = btf_get_by_fd(attr->btf_fd);
590 if (IS_ERR(btf)) { 597 if (IS_ERR(btf)) {
591 err = PTR_ERR(btf); 598 err = PTR_ERR(btf);
592 goto free_map_nouncharge; 599 goto free_map;
593 } 600 }
594 601
595 err = map_check_btf(map, btf, attr->btf_key_type_id, 602 err = map_check_btf(map, btf, attr->btf_key_type_id,
596 attr->btf_value_type_id); 603 attr->btf_value_type_id);
597 if (err) { 604 if (err) {
598 btf_put(btf); 605 btf_put(btf);
599 goto free_map_nouncharge; 606 goto free_map;
600 } 607 }
601 608
602 map->btf = btf; 609 map->btf = btf;
@@ -608,15 +615,11 @@ static int map_create(union bpf_attr *attr)
608 615
609 err = security_bpf_map_alloc(map); 616 err = security_bpf_map_alloc(map);
610 if (err) 617 if (err)
611 goto free_map_nouncharge; 618 goto free_map;
612
613 err = bpf_map_init_memlock(map);
614 if (err)
615 goto free_map_sec;
616 619
617 err = bpf_map_alloc_id(map); 620 err = bpf_map_alloc_id(map);
618 if (err) 621 if (err)
619 goto free_map; 622 goto free_map_sec;
620 623
621 err = bpf_map_new_fd(map, f_flags); 624 err = bpf_map_new_fd(map, f_flags);
622 if (err < 0) { 625 if (err < 0) {
@@ -632,13 +635,13 @@ static int map_create(union bpf_attr *attr)
632 635
633 return err; 636 return err;
634 637
635free_map:
636 bpf_map_release_memlock(map);
637free_map_sec: 638free_map_sec:
638 security_bpf_map_free(map); 639 security_bpf_map_free(map);
639free_map_nouncharge: 640free_map:
640 btf_put(map->btf); 641 btf_put(map->btf);
642 bpf_map_charge_move(&mem, &map->memory);
641 map->ops->map_free(map); 643 map->ops->map_free(map);
644 bpf_map_charge_finish(&mem);
642 return err; 645 return err;
643} 646}
644 647
@@ -1585,6 +1588,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
1585 default: 1588 default:
1586 return -EINVAL; 1589 return -EINVAL;
1587 } 1590 }
1591 case BPF_PROG_TYPE_CGROUP_SKB:
1592 switch (expected_attach_type) {
1593 case BPF_CGROUP_INET_INGRESS:
1594 case BPF_CGROUP_INET_EGRESS:
1595 return 0;
1596 default:
1597 return -EINVAL;
1598 }
1588 default: 1599 default:
1589 return 0; 1600 return 0;
1590 } 1601 }
@@ -1604,7 +1615,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
1604 if (CHECK_ATTR(BPF_PROG_LOAD)) 1615 if (CHECK_ATTR(BPF_PROG_LOAD))
1605 return -EINVAL; 1616 return -EINVAL;
1606 1617
1607 if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) 1618 if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
1619 BPF_F_ANY_ALIGNMENT |
1620 BPF_F_TEST_RND_HI32))
1608 return -EINVAL; 1621 return -EINVAL;
1609 1622
1610 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && 1623 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -1834,6 +1847,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
1834 case BPF_PROG_TYPE_CGROUP_SOCK: 1847 case BPF_PROG_TYPE_CGROUP_SOCK:
1835 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 1848 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
1836 return attach_type == prog->expected_attach_type ? 0 : -EINVAL; 1849 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
1850 case BPF_PROG_TYPE_CGROUP_SKB:
1851 return prog->enforce_expected_attach_type &&
1852 prog->expected_attach_type != attach_type ?
1853 -EINVAL : 0;
1837 default: 1854 default:
1838 return 0; 1855 return 0;
1839 } 1856 }
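
Illustration (not part of the diff): taken together, the two syscall.c hunks above mean a BPF_PROG_TYPE_CGROUP_SKB program is loaded with an expected_attach_type of either BPF_CGROUP_INET_INGRESS or BPF_CGROUP_INET_EGRESS, and once the verifier sets enforce_expected_attach_type (because the program can return 2 or 3), the later BPF_PROG_ATTACH must use the same attach type. A userspace sketch using the raw bpf(2) syscall, with error handling omitted:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int load_and_attach_egress(const struct bpf_insn *insns, unsigned int insn_cnt,
			   int cgroup_fd)
{
	union bpf_attr attr;
	int prog_fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_CGROUP_SKB;
	attr.expected_attach_type = BPF_CGROUP_INET_EGRESS;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)"GPL";
	prog_fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgroup_fd;
	attr.attach_bpf_fd = prog_fd;
	/* must match the expected_attach_type above once enforcement is on */
	attr.attach_type = BPF_CGROUP_INET_EGRESS;
	return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
}
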
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 95f9354495ad..5c2cb5bd84ce 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -176,7 +176,7 @@ struct bpf_verifier_stack_elem {
176 struct bpf_verifier_stack_elem *next; 176 struct bpf_verifier_stack_elem *next;
177}; 177};
178 178
179#define BPF_COMPLEXITY_LIMIT_STACK 1024 179#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
180#define BPF_COMPLEXITY_LIMIT_STATES 64 180#define BPF_COMPLEXITY_LIMIT_STATES 64
181 181
182#define BPF_MAP_PTR_UNPRIV 1UL 182#define BPF_MAP_PTR_UNPRIV 1UL
@@ -782,8 +782,9 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
782 if (err) 782 if (err)
783 goto err; 783 goto err;
784 elem->st.speculative |= speculative; 784 elem->st.speculative |= speculative;
785 if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { 785 if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
786 verbose(env, "BPF program is too complex\n"); 786 verbose(env, "The sequence of %d jumps is too complex.\n",
787 env->stack_size);
787 goto err; 788 goto err;
788 } 789 }
789 return &elem->st; 790 return &elem->st;
@@ -981,6 +982,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
981 __mark_reg_not_init(regs + regno); 982 __mark_reg_not_init(regs + regno);
982} 983}
983 984
985#define DEF_NOT_SUBREG (0)
984static void init_reg_state(struct bpf_verifier_env *env, 986static void init_reg_state(struct bpf_verifier_env *env,
985 struct bpf_func_state *state) 987 struct bpf_func_state *state)
986{ 988{
@@ -991,6 +993,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
991 mark_reg_not_init(env, regs, i); 993 mark_reg_not_init(env, regs, i);
992 regs[i].live = REG_LIVE_NONE; 994 regs[i].live = REG_LIVE_NONE;
993 regs[i].parent = NULL; 995 regs[i].parent = NULL;
996 regs[i].subreg_def = DEF_NOT_SUBREG;
994 } 997 }
995 998
996 /* frame pointer */ 999 /* frame pointer */
@@ -1136,7 +1139,7 @@ next:
1136 */ 1139 */
1137static int mark_reg_read(struct bpf_verifier_env *env, 1140static int mark_reg_read(struct bpf_verifier_env *env,
1138 const struct bpf_reg_state *state, 1141 const struct bpf_reg_state *state,
1139 struct bpf_reg_state *parent) 1142 struct bpf_reg_state *parent, u8 flag)
1140{ 1143{
1141 bool writes = parent == state->parent; /* Observe write marks */ 1144 bool writes = parent == state->parent; /* Observe write marks */
1142 int cnt = 0; 1145 int cnt = 0;
@@ -1151,17 +1154,26 @@ static int mark_reg_read(struct bpf_verifier_env *env,
1151 parent->var_off.value, parent->off); 1154 parent->var_off.value, parent->off);
1152 return -EFAULT; 1155 return -EFAULT;
1153 } 1156 }
1154 if (parent->live & REG_LIVE_READ) 1157 /* The first condition is more likely to be true than the
1158 * second, checked it first.
1159 */
1160 if ((parent->live & REG_LIVE_READ) == flag ||
1161 parent->live & REG_LIVE_READ64)
1155 /* The parentage chain never changes and 1162 /* The parentage chain never changes and
1156 * this parent was already marked as LIVE_READ. 1163 * this parent was already marked as LIVE_READ.
1157 * There is no need to keep walking the chain again and 1164 * There is no need to keep walking the chain again and
1158 * keep re-marking all parents as LIVE_READ. 1165 * keep re-marking all parents as LIVE_READ.
1159 * This case happens when the same register is read 1166 * This case happens when the same register is read
1160 * multiple times without writes into it in-between. 1167 * multiple times without writes into it in-between.
1168 * Also, if parent has the stronger REG_LIVE_READ64 set,
1169 * then no need to set the weak REG_LIVE_READ32.
1161 */ 1170 */
1162 break; 1171 break;
1163 /* ... then we depend on parent's value */ 1172 /* ... then we depend on parent's value */
1164 parent->live |= REG_LIVE_READ; 1173 parent->live |= flag;
1174 /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
1175 if (flag == REG_LIVE_READ64)
1176 parent->live &= ~REG_LIVE_READ32;
1165 state = parent; 1177 state = parent;
1166 parent = state->parent; 1178 parent = state->parent;
1167 writes = true; 1179 writes = true;
@@ -1173,12 +1185,129 @@ static int mark_reg_read(struct bpf_verifier_env *env,
1173 return 0; 1185 return 0;
1174} 1186}
1175 1187
1188/* This function is supposed to be used by the following 32-bit optimization
1189 * code only. It returns TRUE if the source or destination register operates
1190 * on 64-bit, otherwise return FALSE.
1191 */
1192static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
1193 u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
1194{
1195 u8 code, class, op;
1196
1197 code = insn->code;
1198 class = BPF_CLASS(code);
1199 op = BPF_OP(code);
1200 if (class == BPF_JMP) {
1201 /* BPF_EXIT for "main" will reach here. Return TRUE
1202 * conservatively.
1203 */
1204 if (op == BPF_EXIT)
1205 return true;
1206 if (op == BPF_CALL) {
1207 /* BPF to BPF call will reach here because of marking
1208 * caller saved clobber with DST_OP_NO_MARK for which we
1209 * don't care the register def because they are anyway
1210 * marked as NOT_INIT already.
1211 */
1212 if (insn->src_reg == BPF_PSEUDO_CALL)
1213 return false;
1214 /* Helper call will reach here because of arg type
1215 * check, conservatively return TRUE.
1216 */
1217 if (t == SRC_OP)
1218 return true;
1219
1220 return false;
1221 }
1222 }
1223
1224 if (class == BPF_ALU64 || class == BPF_JMP ||
1225 /* BPF_END always use BPF_ALU class. */
1226 (class == BPF_ALU && op == BPF_END && insn->imm == 64))
1227 return true;
1228
1229 if (class == BPF_ALU || class == BPF_JMP32)
1230 return false;
1231
1232 if (class == BPF_LDX) {
1233 if (t != SRC_OP)
1234 return BPF_SIZE(code) == BPF_DW;
1235 /* LDX source must be ptr. */
1236 return true;
1237 }
1238
1239 if (class == BPF_STX) {
1240 if (reg->type != SCALAR_VALUE)
1241 return true;
1242 return BPF_SIZE(code) == BPF_DW;
1243 }
1244
1245 if (class == BPF_LD) {
1246 u8 mode = BPF_MODE(code);
1247
1248 /* LD_IMM64 */
1249 if (mode == BPF_IMM)
1250 return true;
1251
1252 /* Both LD_IND and LD_ABS return 32-bit data. */
1253 if (t != SRC_OP)
1254 return false;
1255
1256 /* Implicit ctx ptr. */
1257 if (regno == BPF_REG_6)
1258 return true;
1259
1260 /* Explicit source could be any width. */
1261 return true;
1262 }
1263
1264 if (class == BPF_ST)
1265 /* The only source register for BPF_ST is a ptr. */
1266 return true;
1267
1268 /* Conservatively return true at default. */
1269 return true;
1270}
1271
1272/* Return TRUE if INSN doesn't have explicit value define. */
1273static bool insn_no_def(struct bpf_insn *insn)
1274{
1275 u8 class = BPF_CLASS(insn->code);
1276
1277 return (class == BPF_JMP || class == BPF_JMP32 ||
1278 class == BPF_STX || class == BPF_ST);
1279}
1280
1281/* Return TRUE if INSN has defined any 32-bit value explicitly. */
1282static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
1283{
1284 if (insn_no_def(insn))
1285 return false;
1286
1287 return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
1288}
1289
1290static void mark_insn_zext(struct bpf_verifier_env *env,
1291 struct bpf_reg_state *reg)
1292{
1293 s32 def_idx = reg->subreg_def;
1294
1295 if (def_idx == DEF_NOT_SUBREG)
1296 return;
1297
1298 env->insn_aux_data[def_idx - 1].zext_dst = true;
1299 /* The dst will be zero extended, so won't be sub-register anymore. */
1300 reg->subreg_def = DEF_NOT_SUBREG;
1301}
1302
1176static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, 1303static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1177 enum reg_arg_type t) 1304 enum reg_arg_type t)
1178{ 1305{
1179 struct bpf_verifier_state *vstate = env->cur_state; 1306 struct bpf_verifier_state *vstate = env->cur_state;
1180 struct bpf_func_state *state = vstate->frame[vstate->curframe]; 1307 struct bpf_func_state *state = vstate->frame[vstate->curframe];
1308 struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
1181 struct bpf_reg_state *reg, *regs = state->regs; 1309 struct bpf_reg_state *reg, *regs = state->regs;
1310 bool rw64;
1182 1311
1183 if (regno >= MAX_BPF_REG) { 1312 if (regno >= MAX_BPF_REG) {
1184 verbose(env, "R%d is invalid\n", regno); 1313 verbose(env, "R%d is invalid\n", regno);
@@ -1186,6 +1315,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1186 } 1315 }
1187 1316
1188 reg = &regs[regno]; 1317 reg = &regs[regno];
1318 rw64 = is_reg64(env, insn, regno, reg, t);
1189 if (t == SRC_OP) { 1319 if (t == SRC_OP) {
1190 /* check whether register used as source operand can be read */ 1320 /* check whether register used as source operand can be read */
1191 if (reg->type == NOT_INIT) { 1321 if (reg->type == NOT_INIT) {
@@ -1196,7 +1326,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1196 if (regno == BPF_REG_FP) 1326 if (regno == BPF_REG_FP)
1197 return 0; 1327 return 0;
1198 1328
1199 return mark_reg_read(env, reg, reg->parent); 1329 if (rw64)
1330 mark_insn_zext(env, reg);
1331
1332 return mark_reg_read(env, reg, reg->parent,
1333 rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
1200 } else { 1334 } else {
1201 /* check whether register used as dest operand can be written to */ 1335 /* check whether register used as dest operand can be written to */
1202 if (regno == BPF_REG_FP) { 1336 if (regno == BPF_REG_FP) {
@@ -1204,6 +1338,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1204 return -EACCES; 1338 return -EACCES;
1205 } 1339 }
1206 reg->live |= REG_LIVE_WRITTEN; 1340 reg->live |= REG_LIVE_WRITTEN;
1341 reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
1207 if (t == DST_OP) 1342 if (t == DST_OP)
1208 mark_reg_unknown(env, regs, regno); 1343 mark_reg_unknown(env, regs, regno);
1209 } 1344 }
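
Illustration (not part of the diff): the effect of the new liveness tracking is easiest to see on a two-instruction sequence. The 32-bit move records itself in subreg_def; when BPF_EXIT later reads r0 as a 64-bit value (is_reg64() returns true for BPF_JMP / BPF_EXIT), mark_insn_zext() flags the defining instruction as zext_dst so a zero extension can be patched in later on JITs where bpf_jit_needs_zext() is true:

struct bpf_insn example[] = {
	BPF_MOV32_IMM(BPF_REG_0, 1),	/* 32-bit def: r0.subreg_def points here */
	BPF_EXIT_INSN(),		/* 64-bit read of r0: marks the mov as zext_dst */
};
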
@@ -1383,7 +1518,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
1383 state->regs[value_regno].live |= REG_LIVE_WRITTEN; 1518 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
1384 } 1519 }
1385 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 1520 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
1386 reg_state->stack[spi].spilled_ptr.parent); 1521 reg_state->stack[spi].spilled_ptr.parent,
1522 REG_LIVE_READ64);
1387 return 0; 1523 return 0;
1388 } else { 1524 } else {
1389 int zeros = 0; 1525 int zeros = 0;
@@ -1400,7 +1536,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
1400 return -EACCES; 1536 return -EACCES;
1401 } 1537 }
1402 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 1538 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
1403 reg_state->stack[spi].spilled_ptr.parent); 1539 reg_state->stack[spi].spilled_ptr.parent,
1540 REG_LIVE_READ64);
1404 if (value_regno >= 0) { 1541 if (value_regno >= 0) {
1405 if (zeros == size) { 1542 if (zeros == size) {
1406 /* any size read into register is zero extended, 1543 /* any size read into register is zero extended,
@@ -2109,6 +2246,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2109 value_regno); 2246 value_regno);
2110 if (reg_type_may_be_null(reg_type)) 2247 if (reg_type_may_be_null(reg_type))
2111 regs[value_regno].id = ++env->id_gen; 2248 regs[value_regno].id = ++env->id_gen;
2249 /* A load of ctx field could have different
2250 * actual load size with the one encoded in the
2251 * insn. When the dst is PTR, it is for sure not
2252 * a sub-register.
2253 */
2254 regs[value_regno].subreg_def = DEF_NOT_SUBREG;
2112 } 2255 }
2113 regs[value_regno].type = reg_type; 2256 regs[value_regno].type = reg_type;
2114 } 2257 }
@@ -2368,7 +2511,8 @@ mark:
2368 * the whole slot to be marked as 'read' 2511 * the whole slot to be marked as 'read'
2369 */ 2512 */
2370 mark_reg_read(env, &state->stack[spi].spilled_ptr, 2513 mark_reg_read(env, &state->stack[spi].spilled_ptr,
2371 state->stack[spi].spilled_ptr.parent); 2514 state->stack[spi].spilled_ptr.parent,
2515 REG_LIVE_READ64);
2372 } 2516 }
2373 return update_stack_depth(env, state, min_off); 2517 return update_stack_depth(env, state, min_off);
2374} 2518}
@@ -3332,6 +3476,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3332 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); 3476 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
3333 } 3477 }
3334 3478
3479 /* helper call returns 64-bit value. */
3480 regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
3481
3335 /* update return register (already marked as written above) */ 3482 /* update return register (already marked as written above) */
3336 if (fn->ret_type == RET_INTEGER) { 3483 if (fn->ret_type == RET_INTEGER) {
3337 /* sets type to SCALAR_VALUE */ 3484 /* sets type to SCALAR_VALUE */
@@ -4263,6 +4410,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
4263 */ 4410 */
4264 *dst_reg = *src_reg; 4411 *dst_reg = *src_reg;
4265 dst_reg->live |= REG_LIVE_WRITTEN; 4412 dst_reg->live |= REG_LIVE_WRITTEN;
4413 dst_reg->subreg_def = DEF_NOT_SUBREG;
4266 } else { 4414 } else {
4267 /* R1 = (u32) R2 */ 4415 /* R1 = (u32) R2 */
4268 if (is_pointer_value(env, insn->src_reg)) { 4416 if (is_pointer_value(env, insn->src_reg)) {
@@ -4273,6 +4421,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
4273 } else if (src_reg->type == SCALAR_VALUE) { 4421 } else if (src_reg->type == SCALAR_VALUE) {
4274 *dst_reg = *src_reg; 4422 *dst_reg = *src_reg;
4275 dst_reg->live |= REG_LIVE_WRITTEN; 4423 dst_reg->live |= REG_LIVE_WRITTEN;
4424 dst_reg->subreg_def = env->insn_idx + 1;
4276 } else { 4425 } else {
4277 mark_reg_unknown(env, regs, 4426 mark_reg_unknown(env, regs,
4278 insn->dst_reg); 4427 insn->dst_reg);
@@ -5352,16 +5501,23 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
5352 * Already marked as written above. 5501 * Already marked as written above.
5353 */ 5502 */
5354 mark_reg_unknown(env, regs, BPF_REG_0); 5503 mark_reg_unknown(env, regs, BPF_REG_0);
5504 /* ld_abs load up to 32-bit skb data. */
5505 regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
5355 return 0; 5506 return 0;
5356} 5507}
5357 5508
5358static int check_return_code(struct bpf_verifier_env *env) 5509static int check_return_code(struct bpf_verifier_env *env)
5359{ 5510{
5511 struct tnum enforce_attach_type_range = tnum_unknown;
5360 struct bpf_reg_state *reg; 5512 struct bpf_reg_state *reg;
5361 struct tnum range = tnum_range(0, 1); 5513 struct tnum range = tnum_range(0, 1);
5362 5514
5363 switch (env->prog->type) { 5515 switch (env->prog->type) {
5364 case BPF_PROG_TYPE_CGROUP_SKB: 5516 case BPF_PROG_TYPE_CGROUP_SKB:
5517 if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
5518 range = tnum_range(0, 3);
5519 enforce_attach_type_range = tnum_range(2, 3);
5520 }
5365 case BPF_PROG_TYPE_CGROUP_SOCK: 5521 case BPF_PROG_TYPE_CGROUP_SOCK:
5366 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 5522 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
5367 case BPF_PROG_TYPE_SOCK_OPS: 5523 case BPF_PROG_TYPE_SOCK_OPS:
@@ -5380,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env)
5380 } 5536 }
5381 5537
5382 if (!tnum_in(range, reg->var_off)) { 5538 if (!tnum_in(range, reg->var_off)) {
5539 char tn_buf[48];
5540
5383 verbose(env, "At program exit the register R0 "); 5541 verbose(env, "At program exit the register R0 ");
5384 if (!tnum_is_unknown(reg->var_off)) { 5542 if (!tnum_is_unknown(reg->var_off)) {
5385 char tn_buf[48];
5386
5387 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); 5543 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
5388 verbose(env, "has value %s", tn_buf); 5544 verbose(env, "has value %s", tn_buf);
5389 } else { 5545 } else {
5390 verbose(env, "has unknown scalar value"); 5546 verbose(env, "has unknown scalar value");
5391 } 5547 }
5392 verbose(env, " should have been 0 or 1\n"); 5548 tnum_strn(tn_buf, sizeof(tn_buf), range);
5549 verbose(env, " should have been %s\n", tn_buf);
5393 return -EINVAL; 5550 return -EINVAL;
5394 } 5551 }
5552
5553 if (!tnum_is_unknown(enforce_attach_type_range) &&
5554 tnum_in(enforce_attach_type_range, reg->var_off))
5555 env->prog->enforce_expected_attach_type = 1;
5395 return 0; 5556 return 0;
5396} 5557}
5397 5558
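
For BPF_PROG_TYPE_CGROUP_SKB loaded with expected_attach_type == BPF_CGROUP_INET_EGRESS, R0 may now hold any value in [0, 3], and a provable return of 2 or 3 sets enforce_expected_attach_type so the program can only be attached at the egress hook it was verified for. A hedged user-side illustration (section name, the SEC macro and the meaning of the extended codes are assumptions, not taken from this diff):

    #include <linux/bpf.h>

    #define SEC(name) __attribute__((section(name), used))

    SEC("cgroup_skb/egress")
    int egress_verdict(struct __sk_buff *skb)
    {
            if (skb->len > 1400)
                    return 2;   /* extended egress verdict, only accepted here */
            return 1;           /* allow */
    }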
@@ -5435,7 +5596,25 @@ enum {
5435 BRANCH = 2, 5596 BRANCH = 2,
5436}; 5597};
5437 5598
5438#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) 5599static u32 state_htab_size(struct bpf_verifier_env *env)
5600{
5601 return env->prog->len;
5602}
5603
5604static struct bpf_verifier_state_list **explored_state(
5605 struct bpf_verifier_env *env,
5606 int idx)
5607{
5608 struct bpf_verifier_state *cur = env->cur_state;
5609 struct bpf_func_state *state = cur->frame[cur->curframe];
5610
5611 return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
5612}
5613
5614static void init_explored_state(struct bpf_verifier_env *env, int idx)
5615{
5616 env->insn_aux_data[idx].prune_point = true;
5617}
5439 5618
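
With these helpers, explored_states changes from one list head per instruction (using STATE_LIST_MARK as a "prune point, no states yet" sentinel) to a hash table keyed on insn_idx xor the callsite, with the prune-point property moved into insn_aux_data. A stand-alone sketch of the resulting lookup, written only against fields visible in this diff:

    /* Sketch, not the kernel code: walk one hash bucket and return the
     * first entry recorded for this instruction; callers still run
     * states_equal() against it. */
    static struct bpf_verifier_state_list *
    first_state_at(struct bpf_verifier_env *env, int insn_idx, u32 callsite)
    {
            u32 bucket = (insn_idx ^ callsite) % env->prog->len;
            struct bpf_verifier_state_list *sl = env->explored_states[bucket];

            for (; sl; sl = sl->next)
                    if (sl->state.insn_idx == insn_idx)
                            return sl;
            return NULL;
    }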
5440/* t, w, e - match pseudo-code above: 5619/* t, w, e - match pseudo-code above:
5441 * t - index of current instruction 5620 * t - index of current instruction
@@ -5461,7 +5640,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
5461 5640
5462 if (e == BRANCH) 5641 if (e == BRANCH)
5463 /* mark branch target for state pruning */ 5642 /* mark branch target for state pruning */
5464 env->explored_states[w] = STATE_LIST_MARK; 5643 init_explored_state(env, w);
5465 5644
5466 if (insn_state[w] == 0) { 5645 if (insn_state[w] == 0) {
5467 /* tree-edge */ 5646 /* tree-edge */
@@ -5529,9 +5708,9 @@ peek_stack:
5529 else if (ret < 0) 5708 else if (ret < 0)
5530 goto err_free; 5709 goto err_free;
5531 if (t + 1 < insn_cnt) 5710 if (t + 1 < insn_cnt)
5532 env->explored_states[t + 1] = STATE_LIST_MARK; 5711 init_explored_state(env, t + 1);
5533 if (insns[t].src_reg == BPF_PSEUDO_CALL) { 5712 if (insns[t].src_reg == BPF_PSEUDO_CALL) {
5534 env->explored_states[t] = STATE_LIST_MARK; 5713 init_explored_state(env, t);
5535 ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); 5714 ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
5536 if (ret == 1) 5715 if (ret == 1)
5537 goto peek_stack; 5716 goto peek_stack;
@@ -5554,10 +5733,10 @@ peek_stack:
5554 * after every call and jump 5733 * after every call and jump
5555 */ 5734 */
5556 if (t + 1 < insn_cnt) 5735 if (t + 1 < insn_cnt)
5557 env->explored_states[t + 1] = STATE_LIST_MARK; 5736 init_explored_state(env, t + 1);
5558 } else { 5737 } else {
5559 /* conditional jump with two edges */ 5738 /* conditional jump with two edges */
5560 env->explored_states[t] = STATE_LIST_MARK; 5739 init_explored_state(env, t);
5561 ret = push_insn(t, t + 1, FALLTHROUGH, env); 5740 ret = push_insn(t, t + 1, FALLTHROUGH, env);
5562 if (ret == 1) 5741 if (ret == 1)
5563 goto peek_stack; 5742 goto peek_stack;
@@ -6005,12 +6184,10 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
6005 struct bpf_verifier_state_list *sl; 6184 struct bpf_verifier_state_list *sl;
6006 int i; 6185 int i;
6007 6186
6008 sl = env->explored_states[insn]; 6187 sl = *explored_state(env, insn);
6009 if (!sl) 6188 while (sl) {
6010 return; 6189 if (sl->state.insn_idx != insn ||
6011 6190 sl->state.curframe != cur->curframe)
6012 while (sl != STATE_LIST_MARK) {
6013 if (sl->state.curframe != cur->curframe)
6014 goto next; 6191 goto next;
6015 for (i = 0; i <= cur->curframe; i++) 6192 for (i = 0; i <= cur->curframe; i++)
6016 if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) 6193 if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
@@ -6292,20 +6469,33 @@ static bool states_equal(struct bpf_verifier_env *env,
6292 return true; 6469 return true;
6293} 6470}
6294 6471
6472/* Return 0 if no propagation happened. Return negative error code if error
6473 * happened. Otherwise, return the propagated bit.
6474 */
6295static int propagate_liveness_reg(struct bpf_verifier_env *env, 6475static int propagate_liveness_reg(struct bpf_verifier_env *env,
6296 struct bpf_reg_state *reg, 6476 struct bpf_reg_state *reg,
6297 struct bpf_reg_state *parent_reg) 6477 struct bpf_reg_state *parent_reg)
6298{ 6478{
6479 u8 parent_flag = parent_reg->live & REG_LIVE_READ;
6480 u8 flag = reg->live & REG_LIVE_READ;
6299 int err; 6481 int err;
6300 6482
6301 if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) 6483 /* When comes here, read flags of PARENT_REG or REG could be any of
6484 * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
6485 * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
6486 */
6487 if (parent_flag == REG_LIVE_READ64 ||
6488 /* Or if there is no read flag from REG. */
6489 !flag ||
6490 /* Or if the read flag from REG is the same as PARENT_REG. */
6491 parent_flag == flag)
6302 return 0; 6492 return 0;
6303 6493
6304 err = mark_reg_read(env, reg, parent_reg); 6494 err = mark_reg_read(env, reg, parent_reg, flag);
6305 if (err) 6495 if (err)
6306 return err; 6496 return err;
6307 6497
6308 return 0; 6498 return flag;
6309} 6499}
6310 6500
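
propagate_liveness_reg() now returns the flag it actually propagated (or 0), so the caller can react when a 64-bit read crosses a state boundary. The skip conditions above distil to the following predicate (sketch only, same logic restated):

    static bool read_flag_needs_propagation(u8 parent_flag, u8 flag)
    {
            if (parent_flag == REG_LIVE_READ64)  /* parent already strongest  */
                    return false;
            if (!flag)                           /* nothing read from REG     */
                    return false;
            return parent_flag != flag;          /* equal flags add nothing   */
    }

When the propagated flag is REG_LIVE_READ64, propagate_liveness() below additionally calls mark_insn_zext() on the parent register so its 32-bit defining instruction gets flagged for zero extension.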
6311/* A write screens off any subsequent reads; but write marks come from the 6501/* A write screens off any subsequent reads; but write marks come from the
@@ -6339,8 +6529,10 @@ static int propagate_liveness(struct bpf_verifier_env *env,
6339 for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { 6529 for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
6340 err = propagate_liveness_reg(env, &state_reg[i], 6530 err = propagate_liveness_reg(env, &state_reg[i],
6341 &parent_reg[i]); 6531 &parent_reg[i]);
6342 if (err) 6532 if (err < 0)
6343 return err; 6533 return err;
6534 if (err == REG_LIVE_READ64)
6535 mark_insn_zext(env, &parent_reg[i]);
6344 } 6536 }
6345 6537
6346 /* Propagate stack slots. */ 6538 /* Propagate stack slots. */
@@ -6350,11 +6542,11 @@ static int propagate_liveness(struct bpf_verifier_env *env,
6350 state_reg = &state->stack[i].spilled_ptr; 6542 state_reg = &state->stack[i].spilled_ptr;
6351 err = propagate_liveness_reg(env, state_reg, 6543 err = propagate_liveness_reg(env, state_reg,
6352 parent_reg); 6544 parent_reg);
6353 if (err) 6545 if (err < 0)
6354 return err; 6546 return err;
6355 } 6547 }
6356 } 6548 }
6357 return err; 6549 return 0;
6358} 6550}
6359 6551
6360static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) 6552static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
@@ -6364,18 +6556,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6364 struct bpf_verifier_state *cur = env->cur_state, *new; 6556 struct bpf_verifier_state *cur = env->cur_state, *new;
6365 int i, j, err, states_cnt = 0; 6557 int i, j, err, states_cnt = 0;
6366 6558
6367 pprev = &env->explored_states[insn_idx]; 6559 if (!env->insn_aux_data[insn_idx].prune_point)
6368 sl = *pprev;
6369
6370 if (!sl)
6371 /* this 'insn_idx' instruction wasn't marked, so we will not 6560 /* this 'insn_idx' instruction wasn't marked, so we will not
6372 * be doing state search here 6561 * be doing state search here
6373 */ 6562 */
6374 return 0; 6563 return 0;
6375 6564
6565 pprev = explored_state(env, insn_idx);
6566 sl = *pprev;
6567
6376 clean_live_states(env, insn_idx, cur); 6568 clean_live_states(env, insn_idx, cur);
6377 6569
6378 while (sl != STATE_LIST_MARK) { 6570 while (sl) {
6571 states_cnt++;
6572 if (sl->state.insn_idx != insn_idx)
6573 goto next;
6379 if (states_equal(env, &sl->state, cur)) { 6574 if (states_equal(env, &sl->state, cur)) {
6380 sl->hit_cnt++; 6575 sl->hit_cnt++;
6381 /* reached equivalent register/stack state, 6576 /* reached equivalent register/stack state,
@@ -6393,7 +6588,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6393 return err; 6588 return err;
6394 return 1; 6589 return 1;
6395 } 6590 }
6396 states_cnt++;
6397 sl->miss_cnt++; 6591 sl->miss_cnt++;
6398 /* heuristic to determine whether this state is beneficial 6592 /* heuristic to determine whether this state is beneficial
6399 * to keep checking from state equivalence point of view. 6593 * to keep checking from state equivalence point of view.
@@ -6420,6 +6614,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6420 sl = *pprev; 6614 sl = *pprev;
6421 continue; 6615 continue;
6422 } 6616 }
6617next:
6423 pprev = &sl->next; 6618 pprev = &sl->next;
6424 sl = *pprev; 6619 sl = *pprev;
6425 } 6620 }
@@ -6451,8 +6646,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6451 kfree(new_sl); 6646 kfree(new_sl);
6452 return err; 6647 return err;
6453 } 6648 }
6454 new_sl->next = env->explored_states[insn_idx]; 6649 new->insn_idx = insn_idx;
6455 env->explored_states[insn_idx] = new_sl; 6650 new_sl->next = *explored_state(env, insn_idx);
6651 *explored_state(env, insn_idx) = new_sl;
6456 /* connect new state to parentage chain. Current frame needs all 6652 /* connect new state to parentage chain. Current frame needs all
6457 * registers connected. Only r6 - r9 of the callers are alive (pushed 6653 * registers connected. Only r6 - r9 of the callers are alive (pushed
6458 * to the stack implicitly by JITs) so in callers' frames connect just 6654 * to the stack implicitly by JITs) so in callers' frames connect just
@@ -7130,14 +7326,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
7130 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying 7326 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
7131 * [0, off) and [off, end) to new locations, so the patched range stays zero 7327 * [0, off) and [off, end) to new locations, so the patched range stays zero
7132 */ 7328 */
7133static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, 7329static int adjust_insn_aux_data(struct bpf_verifier_env *env,
7134 u32 off, u32 cnt) 7330 struct bpf_prog *new_prog, u32 off, u32 cnt)
7135{ 7331{
7136 struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; 7332 struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
7333 struct bpf_insn *insn = new_prog->insnsi;
7334 u32 prog_len;
7137 int i; 7335 int i;
7138 7336
 7337	 /* aux info at OFF always needs adjustment, no matter whether the fast
 7338	 * path (cnt == 1) is taken or not. There is no guarantee that INSN at OFF
 7339	 * is still the original insn of the old prog.
7340 */
7341 old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
7342
7139 if (cnt == 1) 7343 if (cnt == 1)
7140 return 0; 7344 return 0;
7345 prog_len = new_prog->len;
7141 new_data = vzalloc(array_size(prog_len, 7346 new_data = vzalloc(array_size(prog_len,
7142 sizeof(struct bpf_insn_aux_data))); 7347 sizeof(struct bpf_insn_aux_data)));
7143 if (!new_data) 7348 if (!new_data)
@@ -7145,8 +7350,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
7145 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); 7350 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
7146 memcpy(new_data + off + cnt - 1, old_data + off, 7351 memcpy(new_data + off + cnt - 1, old_data + off,
7147 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); 7352 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
7148 for (i = off; i < off + cnt - 1; i++) 7353 for (i = off; i < off + cnt - 1; i++) {
7149 new_data[i].seen = true; 7354 new_data[i].seen = true;
7355 new_data[i].zext_dst = insn_has_def32(env, insn + i);
7356 }
7150 env->insn_aux_data = new_data; 7357 env->insn_aux_data = new_data;
7151 vfree(old_data); 7358 vfree(old_data);
7152 return 0; 7359 return 0;
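
adjust_insn_aux_data() now refreshes zext_dst for the slot at OFF and for every newly patched slot using insn_has_def32(). That helper is added elsewhere in this patch, outside the hunks shown; its assumed shape, built from the two predicates that do appear later in this diff, is roughly:

    /* Assumed shape, for orientation only: an insn "defines a 32-bit
     * sub-register" when it defines something at all and that definition
     * is not 64-bit wide. */
    static bool insn_has_def32(struct bpf_verifier_env *env,
                               struct bpf_insn *insn)
    {
            return !insn_no_def(insn) &&
                   !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
    }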
@@ -7179,7 +7386,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
7179 env->insn_aux_data[off].orig_idx); 7386 env->insn_aux_data[off].orig_idx);
7180 return NULL; 7387 return NULL;
7181 } 7388 }
7182 if (adjust_insn_aux_data(env, new_prog->len, off, len)) 7389 if (adjust_insn_aux_data(env, new_prog, off, len))
7183 return NULL; 7390 return NULL;
7184 adjust_subprog_starts(env, off, len); 7391 adjust_subprog_starts(env, off, len);
7185 return new_prog; 7392 return new_prog;
@@ -7443,6 +7650,84 @@ static int opt_remove_nops(struct bpf_verifier_env *env)
7443 return 0; 7650 return 0;
7444} 7651}
7445 7652
7653static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
7654 const union bpf_attr *attr)
7655{
7656 struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
7657 struct bpf_insn_aux_data *aux = env->insn_aux_data;
7658 int i, patch_len, delta = 0, len = env->prog->len;
7659 struct bpf_insn *insns = env->prog->insnsi;
7660 struct bpf_prog *new_prog;
7661 bool rnd_hi32;
7662
7663 rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
7664 zext_patch[1] = BPF_ZEXT_REG(0);
7665 rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
7666 rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
7667 rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
7668 for (i = 0; i < len; i++) {
7669 int adj_idx = i + delta;
7670 struct bpf_insn insn;
7671
7672 insn = insns[adj_idx];
7673 if (!aux[adj_idx].zext_dst) {
7674 u8 code, class;
7675 u32 imm_rnd;
7676
7677 if (!rnd_hi32)
7678 continue;
7679
7680 code = insn.code;
7681 class = BPF_CLASS(code);
7682 if (insn_no_def(&insn))
7683 continue;
7684
7685 /* NOTE: arg "reg" (the fourth one) is only used for
7686 * BPF_STX which has been ruled out in above
7687 * check, it is safe to pass NULL here.
7688 */
7689 if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
7690 if (class == BPF_LD &&
7691 BPF_MODE(code) == BPF_IMM)
7692 i++;
7693 continue;
7694 }
7695
7696 /* ctx load could be transformed into wider load. */
7697 if (class == BPF_LDX &&
7698 aux[adj_idx].ptr_type == PTR_TO_CTX)
7699 continue;
7700
7701 imm_rnd = get_random_int();
7702 rnd_hi32_patch[0] = insn;
7703 rnd_hi32_patch[1].imm = imm_rnd;
7704 rnd_hi32_patch[3].dst_reg = insn.dst_reg;
7705 patch = rnd_hi32_patch;
7706 patch_len = 4;
7707 goto apply_patch_buffer;
7708 }
7709
7710 if (!bpf_jit_needs_zext())
7711 continue;
7712
7713 zext_patch[0] = insn;
7714 zext_patch[1].dst_reg = insn.dst_reg;
7715 zext_patch[1].src_reg = insn.dst_reg;
7716 patch = zext_patch;
7717 patch_len = 2;
7718apply_patch_buffer:
7719 new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
7720 if (!new_prog)
7721 return -ENOMEM;
7722 env->prog = new_prog;
7723 insns = new_prog->insnsi;
7724 aux = env->insn_aux_data;
7725 delta += patch_len - 1;
7726 }
7727
7728 return 0;
7729}
7730
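
For readers not fluent in the insn macros: the two patch buffers built above expand a 32-bit definition either into "insn + explicit zero extension" (when the JIT asked for it) or, under BPF_F_TEST_RND_HI32, into "insn + poison the high 32 bits with a random value" so tests catch consumers that wrongly rely on those bits being zero. A hedged illustration with placeholder registers and immediate:

    /* Placeholder example: R2 stands for the patched insn's dst_reg,
     * 0x1234 for the random immediate chosen at patch time. */
    struct bpf_insn zext_shape[2] = {
            BPF_MOV32_REG(BPF_REG_2, BPF_REG_1),           /* original 32-bit def  */
            BPF_ZEXT_REG(BPF_REG_2),                       /* added zero extension */
    };

    struct bpf_insn rnd_hi32_shape[4] = {
            BPF_MOV32_REG(BPF_REG_2, BPF_REG_1),           /* original 32-bit def  */
            BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0x1234),    /* AX = random value    */
            BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32),        /* AX <<= 32            */
            BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_AX),  /* corrupt hi32 of dst  */
    };

Note that the scan also skips the second half of a BPF_LD | BPF_IMM (ld_imm64) pair, which is why i is bumped an extra time for that class.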
7446/* convert load instructions that access fields of a context type into a 7731/* convert load instructions that access fields of a context type into a
7447 * sequence of instructions that access fields of the underlying structure: 7732 * sequence of instructions that access fields of the underlying structure:
7448 * struct __sk_buff -> struct sk_buff 7733 * struct __sk_buff -> struct sk_buff
@@ -8130,16 +8415,15 @@ static void free_states(struct bpf_verifier_env *env)
8130 if (!env->explored_states) 8415 if (!env->explored_states)
8131 return; 8416 return;
8132 8417
8133 for (i = 0; i < env->prog->len; i++) { 8418 for (i = 0; i < state_htab_size(env); i++) {
8134 sl = env->explored_states[i]; 8419 sl = env->explored_states[i];
8135 8420
8136 if (sl) 8421 while (sl) {
8137 while (sl != STATE_LIST_MARK) { 8422 sln = sl->next;
8138 sln = sl->next; 8423 free_verifier_state(&sl->state, false);
8139 free_verifier_state(&sl->state, false); 8424 kfree(sl);
8140 kfree(sl); 8425 sl = sln;
8141 sl = sln; 8426 }
8142 }
8143 } 8427 }
8144 8428
8145 kvfree(env->explored_states); 8429 kvfree(env->explored_states);
@@ -8239,7 +8523,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
8239 goto skip_full_check; 8523 goto skip_full_check;
8240 } 8524 }
8241 8525
8242 env->explored_states = kvcalloc(env->prog->len, 8526 env->explored_states = kvcalloc(state_htab_size(env),
8243 sizeof(struct bpf_verifier_state_list *), 8527 sizeof(struct bpf_verifier_state_list *),
8244 GFP_USER); 8528 GFP_USER);
8245 ret = -ENOMEM; 8529 ret = -ENOMEM;
@@ -8294,6 +8578,15 @@ skip_full_check:
8294 if (ret == 0) 8578 if (ret == 0)
8295 ret = fixup_bpf_calls(env); 8579 ret = fixup_bpf_calls(env);
8296 8580
 8581	 /* do 32-bit optimization after insn patching is done so those patched
 8582	 * insns can be handled correctly.

8583 */
8584 if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
8585 ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
8586 env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
8587 : false;
8588 }
8589
8297 if (ret == 0) 8590 if (ret == 0)
8298 ret = fixup_call_args(env); 8591 ret = fixup_call_args(env);
8299 8592
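
verifier_zext is only set when the rewrite succeeded and bpf_jit_needs_zext() says the JIT wants the verifier to materialize the zero extensions; offloaded programs are left untouched. The hook is a weak function defaulting to false, so a 32-bit JIT opts in by overriding it, roughly like this (sketch, not taken from any particular arch):

    /* Arch JIT side, sketch: ask the verifier to insert explicit zero
     * extensions instead of relying on implicit 64-bit ALU semantics. */
    bool bpf_jit_needs_zext(void)
    {
            return true;
    }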
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..22066c28ba61 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -37,13 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
37 37
38 cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); 38 cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
39 cost += sizeof(struct list_head) * num_possible_cpus(); 39 cost += sizeof(struct list_head) * num_possible_cpus();
40 if (cost >= U32_MAX - PAGE_SIZE)
41 goto free_m;
42
43 m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
44 40
45	 /* Notice returns -EPERM if map size is larger than memlock limit */	 41	 /* Notice returns -EPERM if map size is larger than memlock limit */
46 err = bpf_map_precharge_memlock(m->map.pages); 42 err = bpf_map_charge_init(&m->map.memory, cost);
47 if (err) 43 if (err)
48 goto free_m; 44 goto free_m;
49 45
@@ -51,7 +47,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
51 47
52 m->flush_list = alloc_percpu(struct list_head); 48 m->flush_list = alloc_percpu(struct list_head);
53 if (!m->flush_list) 49 if (!m->flush_list)
54 goto free_m; 50 goto free_charge;
55 51
56 for_each_possible_cpu(cpu) 52 for_each_possible_cpu(cpu)
57 INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); 53 INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
@@ -65,6 +61,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
65 61
66free_percpu: 62free_percpu:
67 free_percpu(m->flush_list); 63 free_percpu(m->flush_list);
64free_charge:
65 bpf_map_charge_finish(&m->map.memory);
68free_m: 66free_m:
69 kfree(m); 67 kfree(m);
70 return ERR_PTR(err); 68 return ERR_PTR(err);
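
The xskmap conversion above shows the shape every map allocator takes in this series: compute the byte cost, charge it against the memlock limit with bpf_map_charge_init() before allocating, and release the charge with bpf_map_charge_finish() on any later failure (the removed U32_MAX checks suggest the size validation now happens inside charge_init). A condensed sketch of that ordering; example_map and some_percpu_alloc are hypothetical, only the charge calls come from the hunks above:

    struct example_map {                    /* hypothetical map type        */
            struct bpf_map map;
            void __percpu *pcpu;
    };

    static struct bpf_map *example_map_alloc(union bpf_attr *attr)
    {
            struct example_map *m;
            u64 cost;
            int err;

            m = kzalloc(sizeof(*m), GFP_USER);
            if (!m)
                    return ERR_PTR(-ENOMEM);

            cost = (u64)attr->max_entries * sizeof(void *);

            /* charge first, before any sizeable allocation */
            err = bpf_map_charge_init(&m->map.memory, cost);
            if (err)
                    goto free_m;

            err = -ENOMEM;
            if (some_percpu_alloc(m))       /* hypothetical second allocation */
                    goto free_charge;

            return &m->map;

    free_charge:
            bpf_map_charge_finish(&m->map.memory);
    free_m:
            kfree(m);
            return ERR_PTR(err);
    }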
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 217cec4e22c6..ef9cfbfc82a9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4955,8 +4955,6 @@ static void css_release_work_fn(struct work_struct *work)
4955 if (cgrp->kn) 4955 if (cgrp->kn)
4956 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, 4956 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
4957 NULL); 4957 NULL);
4958
4959 cgroup_bpf_put(cgrp);
4960 } 4958 }
4961 4959
4962 mutex_unlock(&cgroup_mutex); 4960 mutex_unlock(&cgroup_mutex);
@@ -5482,6 +5480,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5482 5480
5483 cgroup1_check_for_release(parent); 5481 cgroup1_check_for_release(parent);
5484 5482
5483 cgroup_bpf_offline(cgrp);
5484
5485 /* put the base reference */ 5485 /* put the base reference */
5486 percpu_ref_kill(&cgrp->self.refcnt); 5486 percpu_ref_kill(&cgrp->self.refcnt);
5487 5487
@@ -6221,6 +6221,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6221 * Don't use cgroup_get_live(). 6221 * Don't use cgroup_get_live().
6222 */ 6222 */
6223 cgroup_get(sock_cgroup_ptr(skcd)); 6223 cgroup_get(sock_cgroup_ptr(skcd));
6224 cgroup_bpf_get(sock_cgroup_ptr(skcd));
6224 return; 6225 return;
6225 } 6226 }
6226 6227
@@ -6232,6 +6233,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6232 cset = task_css_set(current); 6233 cset = task_css_set(current);
6233 if (likely(cgroup_tryget(cset->dfl_cgrp))) { 6234 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6234 skcd->val = (unsigned long)cset->dfl_cgrp; 6235 skcd->val = (unsigned long)cset->dfl_cgrp;
6236 cgroup_bpf_get(cset->dfl_cgrp);
6235 break; 6237 break;
6236 } 6238 }
6237 cpu_relax(); 6239 cpu_relax();
@@ -6242,7 +6244,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6242 6244
6243void cgroup_sk_free(struct sock_cgroup_data *skcd) 6245void cgroup_sk_free(struct sock_cgroup_data *skcd)
6244{ 6246{
6245 cgroup_put(sock_cgroup_ptr(skcd)); 6247 struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6248
6249 cgroup_bpf_put(cgrp);
6250 cgroup_put(cgrp);
6246} 6251}
6247 6252
6248#endif /* CONFIG_SOCK_CGROUP_DATA */ 6253#endif /* CONFIG_SOCK_CGROUP_DATA */
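
The cgroup.c hunks move the cgroup_bpf reference from the cgroup's own release path to the sockets that can actually run its programs: cgroup_sk_alloc() now takes cgroup_bpf_get() alongside the existing cgroup reference, cgroup_sk_free() drops both, and cgroup_destroy_locked() calls cgroup_bpf_offline() so attached programs are shut down even while such sockets keep the cgroup_bpf alive. A sketch of the pairing from the socket side (the helper name is hypothetical; only the get/put calls come from the hunks above):

    /* Sketch: the invariant established above. */
    static void example_sk_pin_cgroup(struct sock_cgroup_data *skcd,
                                      struct cgroup *cgrp)
    {
            cgroup_get(cgrp);      /* pre-existing: pin the cgroup itself  */
            cgroup_bpf_get(cgrp);  /* new: pin its attached bpf programs   */
            skcd->val = (unsigned long)cgrp;
    }
    /* cgroup_sk_free() mirrors this with cgroup_bpf_put() + cgroup_put(). */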
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f92d6ad5e080..3994a231eb92 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -19,6 +19,9 @@
19#include "trace_probe.h" 19#include "trace_probe.h"
20#include "trace.h" 20#include "trace.h"
21 21
22#define bpf_event_rcu_dereference(p) \
23 rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
24
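
bpf_event_rcu_dereference() lets the mutex-side code below drop the __rcu annotation from its local variables while still documenting, and lockdep-verifying, that bpf_event_mutex is held when tp_event->prog_array is loaded outside an RCU read section. A hedged example of the intended usage pattern (the wrapper function is hypothetical):

    /* Sketch: fetch the prog array on the update side, under the mutex. */
    static struct bpf_prog_array *
    event_prog_array_locked(struct trace_event_call *call)
    {
            /* caller holds bpf_event_mutex; the macro lockdep-checks it */
            return bpf_event_rcu_dereference(call->prog_array);
    }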
22#ifdef CONFIG_MODULES 25#ifdef CONFIG_MODULES
23struct bpf_trace_module { 26struct bpf_trace_module {
24 struct module *module; 27 struct module *module;
@@ -567,6 +570,69 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
567 .arg3_type = ARG_ANYTHING, 570 .arg3_type = ARG_ANYTHING,
568}; 571};
569 572
573struct send_signal_irq_work {
574 struct irq_work irq_work;
575 struct task_struct *task;
576 u32 sig;
577};
578
579static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
580
581static void do_bpf_send_signal(struct irq_work *entry)
582{
583 struct send_signal_irq_work *work;
584
585 work = container_of(entry, struct send_signal_irq_work, irq_work);
586 group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
587}
588
589BPF_CALL_1(bpf_send_signal, u32, sig)
590{
591 struct send_signal_irq_work *work = NULL;
592
593 /* Similar to bpf_probe_write_user, task needs to be
594 * in a sound condition and kernel memory access be
595 * permitted in order to send signal to the current
596 * task.
597 */
598 if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
599 return -EPERM;
600 if (unlikely(uaccess_kernel()))
601 return -EPERM;
602 if (unlikely(!nmi_uaccess_okay()))
603 return -EPERM;
604
605 if (in_nmi()) {
606 /* Do an early check on signal validity. Otherwise,
607 * the error is lost in deferred irq_work.
608 */
609 if (unlikely(!valid_signal(sig)))
610 return -EINVAL;
611
612 work = this_cpu_ptr(&send_signal_work);
613 if (work->irq_work.flags & IRQ_WORK_BUSY)
614 return -EBUSY;
615
616 /* Add the current task, which is the target of sending signal,
617 * to the irq_work. The current task may change when queued
618 * irq works get executed.
619 */
620 work->task = current;
621 work->sig = sig;
622 irq_work_queue(&work->irq_work);
623 return 0;
624 }
625
626 return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
627}
628
629static const struct bpf_func_proto bpf_send_signal_proto = {
630 .func = bpf_send_signal,
631 .gpl_only = false,
632 .ret_type = RET_INTEGER,
633 .arg1_type = ARG_ANYTHING,
634};
635
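
bpf_send_signal(sig) targets the current task and, per the proto above, is not GPL-restricted and takes an arbitrary 32-bit signal number that is validated at call time. A hedged user-side illustration of calling it from a tracing program (attach point, section name and the SIGUSR1 value 10 are assumptions):

    #include <linux/bpf.h>

    #define SEC(name) __attribute__((section(name), used))

    /* old-style helper declaration; signature assumed to match the proto
     * above: one ARG_ANYTHING signal number, integer return */
    static int (*bpf_send_signal)(unsigned int sig) =
            (void *) BPF_FUNC_send_signal;

    SEC("kprobe/some_traced_function")   /* hypothetical attach point */
    int notify_on_hit(void *ctx)
    {
            bpf_send_signal(10 /* SIGUSR1 on most arches */);
            return 0;
    }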
570static const struct bpf_func_proto * 636static const struct bpf_func_proto *
571tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 637tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
572{ 638{
@@ -617,6 +683,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
617 case BPF_FUNC_get_current_cgroup_id: 683 case BPF_FUNC_get_current_cgroup_id:
618 return &bpf_get_current_cgroup_id_proto; 684 return &bpf_get_current_cgroup_id_proto;
619#endif 685#endif
686 case BPF_FUNC_send_signal:
687 return &bpf_send_signal_proto;
620 default: 688 default:
621 return NULL; 689 return NULL;
622 } 690 }
@@ -1034,7 +1102,7 @@ static DEFINE_MUTEX(bpf_event_mutex);
1034int perf_event_attach_bpf_prog(struct perf_event *event, 1102int perf_event_attach_bpf_prog(struct perf_event *event,
1035 struct bpf_prog *prog) 1103 struct bpf_prog *prog)
1036{ 1104{
1037 struct bpf_prog_array __rcu *old_array; 1105 struct bpf_prog_array *old_array;
1038 struct bpf_prog_array *new_array; 1106 struct bpf_prog_array *new_array;
1039 int ret = -EEXIST; 1107 int ret = -EEXIST;
1040 1108
@@ -1052,7 +1120,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
1052 if (event->prog) 1120 if (event->prog)
1053 goto unlock; 1121 goto unlock;
1054 1122
1055 old_array = event->tp_event->prog_array; 1123 old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
1056 if (old_array && 1124 if (old_array &&
1057 bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { 1125 bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
1058 ret = -E2BIG; 1126 ret = -E2BIG;
@@ -1075,7 +1143,7 @@ unlock:
1075 1143
1076void perf_event_detach_bpf_prog(struct perf_event *event) 1144void perf_event_detach_bpf_prog(struct perf_event *event)
1077{ 1145{
1078 struct bpf_prog_array __rcu *old_array; 1146 struct bpf_prog_array *old_array;
1079 struct bpf_prog_array *new_array; 1147 struct bpf_prog_array *new_array;
1080 int ret; 1148 int ret;
1081 1149
@@ -1084,7 +1152,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
1084 if (!event->prog) 1152 if (!event->prog)
1085 goto unlock; 1153 goto unlock;
1086 1154
1087 old_array = event->tp_event->prog_array; 1155 old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
1088 ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); 1156 ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
1089 if (ret == -ENOENT) 1157 if (ret == -ENOENT)
1090 goto unlock; 1158 goto unlock;
@@ -1106,6 +1174,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
1106{ 1174{
1107 struct perf_event_query_bpf __user *uquery = info; 1175 struct perf_event_query_bpf __user *uquery = info;
1108 struct perf_event_query_bpf query = {}; 1176 struct perf_event_query_bpf query = {};
1177 struct bpf_prog_array *progs;
1109 u32 *ids, prog_cnt, ids_len; 1178 u32 *ids, prog_cnt, ids_len;
1110 int ret; 1179 int ret;
1111 1180
@@ -1130,10 +1199,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
1130 */ 1199 */
1131 1200
1132 mutex_lock(&bpf_event_mutex); 1201 mutex_lock(&bpf_event_mutex);
1133 ret = bpf_prog_array_copy_info(event->tp_event->prog_array, 1202 progs = bpf_event_rcu_dereference(event->tp_event->prog_array);
1134 ids, 1203 ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt);
1135 ids_len,
1136 &prog_cnt);
1137 mutex_unlock(&bpf_event_mutex); 1204 mutex_unlock(&bpf_event_mutex);
1138 1205
1139 if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || 1206 if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
@@ -1343,5 +1410,18 @@ static int __init bpf_event_init(void)
1343 return 0; 1410 return 0;
1344} 1411}
1345 1412
1413static int __init send_signal_irq_work_init(void)
1414{
1415 int cpu;
1416 struct send_signal_irq_work *work;
1417
1418 for_each_possible_cpu(cpu) {
1419 work = per_cpu_ptr(&send_signal_work, cpu);
1420 init_irq_work(&work->irq_work, do_bpf_send_signal);
1421 }
1422 return 0;
1423}
1424
1346fs_initcall(bpf_event_init); 1425fs_initcall(bpf_event_init);
1426subsys_initcall(send_signal_irq_work_init);
1347#endif /* CONFIG_MODULES */ 1427#endif /* CONFIG_MODULES */