Diffstat (limited to 'include/linux')
 -rw-r--r--   include/linux/bpf-cgroup.h   |  13
 -rw-r--r--   include/linux/bpf.h          |  78
 -rw-r--r--   include/linux/bpf_verifier.h |  16
 -rw-r--r--   include/linux/cgroup.h       |  18
 -rw-r--r--   include/linux/filter.h       |  18
 5 files changed, 126 insertions, 17 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index cb3c6b3b89c8..b631ee75762d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -6,6 +6,7 @@
 #include <linux/errno.h>
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
+#include <linux/percpu-refcount.h>
 #include <linux/rbtree.h>
 #include <uapi/linux/bpf.h>
 
@@ -71,11 +72,17 @@ struct cgroup_bpf {
 	u32 flags[MAX_BPF_ATTACH_TYPE];
 
 	/* temp storage for effective prog array used by prog_attach/detach */
-	struct bpf_prog_array __rcu *inactive;
+	struct bpf_prog_array *inactive;
+
+	/* reference counter used to detach bpf programs after cgroup removal */
+	struct percpu_ref refcnt;
+
+	/* cgroup_bpf is released using a work queue */
+	struct work_struct release_work;
 };
 
-void cgroup_bpf_put(struct cgroup *cgrp);
 int cgroup_bpf_inherit(struct cgroup *cgrp);
+void cgroup_bpf_offline(struct cgroup *cgrp);
 
 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
@@ -283,8 +290,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
 
 struct bpf_prog;
 struct cgroup_bpf {};
-static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
+static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
 
 static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 					 enum bpf_prog_type ptype,
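Taken together with the cgroup_bpf_get()/cgroup_bpf_put() helpers added to cgroup.h later in this diff, the new refcnt and release_work fields decouple the lifetime of attached BPF programs from the cgroup itself: cgroup removal only calls cgroup_bpf_offline() to kill the percpu reference, and the actual detaching happens from a work item once the last reference is dropped. A minimal sketch of how the fields are presumably wired up in kernel/bpf/cgroup.c (not part of this diff; cgroup_bpf_release() and cgroup_bpf_release_fn() are illustrative names):

/* Sketch only: intended lifetime of the new cgroup_bpf fields. */
static void cgroup_bpf_release(struct work_struct *work)
{
	struct cgroup *cgrp = container_of(work, struct cgroup,
					   bpf.release_work);

	/* detach the remaining programs and free the effective arrays here */

	percpu_ref_exit(&cgrp->bpf.refcnt);
}

static void cgroup_bpf_release_fn(struct percpu_ref *ref)
{
	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);

	/* the release callback must not block, so defer the real cleanup */
	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
	queue_work(system_wq, &cgrp->bpf.release_work);
}

int cgroup_bpf_inherit(struct cgroup *cgrp)
{
	int ret;

	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
			      GFP_KERNEL);
	if (ret)
		return ret;

	/* ... set up the effective prog arrays as before ... */
	return 0;
}

void cgroup_bpf_offline(struct cgroup *cgrp)
{
	/* called when the cgroup is removed: drop the base reference */
	percpu_ref_kill(&cgrp->bpf.refcnt);
}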
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4fb3aa2dc975..e5a309e6a400 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -66,6 +66,11 @@ struct bpf_map_ops {
 			     u64 imm, u32 *off);
 };
 
+struct bpf_map_memory {
+	u32 pages;
+	struct user_struct *user;
+};
+
 struct bpf_map {
 	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
@@ -86,7 +91,7 @@ struct bpf_map {
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
 	struct btf *btf;
-	u32 pages;
+	struct bpf_map_memory memory;
 	bool unpriv_array;
 	bool frozen; /* write-once */
 	/* 48 bytes hole */
@@ -94,8 +99,7 @@ struct bpf_map {
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
-	struct user_struct *user ____cacheline_aligned;
-	atomic_t refcnt;
+	atomic_t refcnt ____cacheline_aligned;
 	atomic_t usercnt;
 	struct work_struct work;
 	char name[BPF_OBJ_NAME_LEN];
@@ -370,6 +374,7 @@ struct bpf_prog_aux {
 	u32 id;
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
+	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
@@ -513,17 +518,17 @@ struct bpf_prog_array {
 };
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
-void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
-int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+void bpf_prog_array_free(struct bpf_prog_array *progs);
+int bpf_prog_array_length(struct bpf_prog_array *progs);
+int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
 				__u32 __user *prog_ids, u32 cnt);
 
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+void bpf_prog_array_delete_safe(struct bpf_prog_array *progs,
 				struct bpf_prog *old_prog);
-int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 			     u32 *prog_ids, u32 request_cnt,
 			     u32 *prog_cnt);
-int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
 			struct bpf_prog_array **new_array);
@@ -551,6 +556,56 @@ _out: \
 	_ret; \
 })
 
+/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
+ * so BPF programs can request cwr for TCP packets.
+ *
+ * Current cgroup skb programs can only return 0 or 1 (0 to drop the
+ * packet. This macro changes the behavior so the low order bit
+ * indicates whether the packet should be dropped (0) or not (1)
+ * and the next bit is a congestion notification bit. This could be
+ * used by TCP to call tcp_enter_cwr()
+ *
+ * Hence, new allowed return values of CGROUP EGRESS BPF programs are:
+ *   0: drop packet
+ *   1: keep packet
+ *   2: drop packet and cn
+ *   3: keep packet and cn
+ *
+ * This macro then converts it to one of the NET_XMIT or an error
+ * code that is then interpreted as drop packet (and no cn):
+ *   0: NET_XMIT_SUCCESS  skb should be transmitted
+ *   1: NET_XMIT_DROP     skb should be dropped and cn
+ *   2: NET_XMIT_CN       skb should be transmitted and cn
+ *   3: -EPERM            skb should be dropped
+ */
+#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \
+	({ \
+		struct bpf_prog_array_item *_item; \
+		struct bpf_prog *_prog; \
+		struct bpf_prog_array *_array; \
+		u32 ret; \
+		u32 _ret = 1; \
+		u32 _cn = 0; \
+		preempt_disable(); \
+		rcu_read_lock(); \
+		_array = rcu_dereference(array); \
+		_item = &_array->items[0]; \
+		while ((_prog = READ_ONCE(_item->prog))) { \
+			bpf_cgroup_storage_set(_item->cgroup_storage); \
+			ret = func(_prog, ctx); \
+			_ret &= (ret & 1); \
+			_cn |= (ret & 2); \
+			_item++; \
+		} \
+		rcu_read_unlock(); \
+		preempt_enable(); \
+		if (_ret) \
+			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \
+		else \
+			_ret = (_cn ? NET_XMIT_DROP : -EPERM); \
+		_ret; \
+	})
+
 #define BPF_PROG_RUN_ARRAY(array, ctx, func) \
 	__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
 
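The return-value folding done by BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY above is easier to follow with a concrete case: the macro ANDs the low bit of every program's return value (keep/drop) and ORs bit 1 (congestion notification), then maps the pair onto a NET_XMIT code or -EPERM. A hedged, kernel-context illustration of that folding for two attached programs (NET_XMIT_* come from linux/netdevice.h; the helper below is purely illustrative and not part of the patch):

/* prog A returns 3 (keep + cn), prog B returns 1 (keep):
 *   keep = 1 & (3 & 1) & (1 & 1) = 1
 *   cn   = 0 | (3 & 2) | (1 & 2) = 2
 *   -> NET_XMIT_CN: transmit, but give TCP a chance to call tcp_enter_cwr().
 * If prog B returned 0 (drop) instead, keep would be 0 and the result would
 * be NET_XMIT_DROP (cn set) or -EPERM (cn clear).
 */
static inline int fold_cgroup_egress_ret(u32 ret_a, u32 ret_b)
{
	u32 keep = (ret_a & 1) & (ret_b & 1);
	u32 cn   = (ret_a & 2) | (ret_b & 2);

	if (keep)
		return cn ? NET_XMIT_CN : NET_XMIT_SUCCESS;
	return cn ? NET_XMIT_DROP : -EPERM;
}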
@@ -595,9 +650,12 @@ struct bpf_map *__bpf_map_get(struct fd f);
 struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
-int bpf_map_precharge_memlock(u32 pages);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
+int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
+void bpf_map_charge_finish(struct bpf_map_memory *mem);
+void bpf_map_charge_move(struct bpf_map_memory *dst,
+			 struct bpf_map_memory *src);
 void *bpf_map_area_alloc(size_t size, int numa_node);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
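Replacing bpf_map_precharge_memlock() with the bpf_map_charge_init()/bpf_map_charge_finish()/bpf_map_charge_move() trio, and folding the user/pages pair into struct bpf_map_memory, suggests a charge-first calling convention for map creation: charge the memlock rlimit into a local bpf_map_memory, move the charge into the map once allocation succeeds, and give it back on failure or when the map is destroyed. A sketch of what a map_alloc callback might look like under this API (struct example_map and the cost calculation are assumptions for illustration, not taken from this diff):

/* Sketch of the expected calling convention for the new charging API. */
static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	struct bpf_map_memory mem;
	struct example_map *emap;	/* hypothetical map type */
	u64 cost = sizeof(*emap) + (u64)attr->max_entries * attr->value_size;
	int err;

	/* charge the full size against RLIMIT_MEMLOCK before allocating;
	 * real code would also validate that cost does not overflow */
	err = bpf_map_charge_init(&mem, cost);
	if (err)
		return ERR_PTR(err);

	emap = bpf_map_area_alloc(cost, NUMA_NO_NODE);
	if (!emap) {
		/* allocation failed: give the charge back */
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}

	/* success: the charge now belongs to the map and is returned via
	 * bpf_map_charge_finish(&map->memory) when the map is freed */
	bpf_map_charge_move(&emap->map.memory, &mem);
	bpf_map_init_from_attr(&emap->map, attr);
	return &emap->map;
}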
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1305ccbd8fe6..704ed7971472 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -36,9 +36,11 @@
  */
 enum bpf_reg_liveness {
 	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
-	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
-	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
-	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
+	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
+	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
+	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
+	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
+	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
 };
 
 struct bpf_reg_state {
@@ -131,6 +133,11 @@ struct bpf_reg_state {
 	 * pointing to bpf_func_state.
 	 */
 	u32 frameno;
+	/* Tracks subreg definition. The stored value is the insn_idx of the
+	 * writing insn. This is safe because subreg_def is used before any insn
+	 * patching which only happens after main verification finished.
+	 */
+	s32 subreg_def;
 	enum bpf_reg_liveness live;
 };
 
@@ -187,6 +194,7 @@ struct bpf_func_state {
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
+	u32 insn_idx;
 	u32 curframe;
 	u32 active_spin_lock;
 	bool speculative;
@@ -232,7 +240,9 @@ struct bpf_insn_aux_data {
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
+	bool zext_dst; /* this insn zero extends dst reg */
 	u8 alu_state; /* used in combination with alu_limit */
+	bool prune_point;
 	unsigned int orig_idx; /* original instruction index */
 };
 
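The REG_LIVE_READ32/READ64 split, the new subreg_def field and the zext_dst flag give the verifier what it needs to decide where a 32-bit write must be followed by an explicit zero extension: a sub-register write records its defining instruction in subreg_def, and if liveness later propagates a full 64-bit read (REG_LIVE_READ64) back to that definition, the defining instruction is marked zext_dst so an extension can be patched in after it. A hedged illustration at the instruction level (register and constant choices are arbitrary; the comments describe the presumed verifier bookkeeping, which is implemented outside this diff):

static const struct bpf_insn zext_example[] = {
	/* insn 0: 32-bit write; defines only the low 32 bits of R1 and
	 * records R1's defining instruction in subreg_def */
	BPF_MOV32_IMM(BPF_REG_1, -1),
	/* insn 1: 64-bit write of the return value */
	BPF_MOV64_IMM(BPF_REG_0, 0),
	/* insn 2: a BPF_JMP (not BPF_JMP32) comparison reads all 64 bits of
	 * R1, so REG_LIVE_READ64 propagates back to insn 0, zext_dst is set
	 * there, and a BPF_ZEXT_REG(BPF_REG_1) can be patched in right after
	 * insn 0 when the JIT asks for it via bpf_jit_needs_zext() */
	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, -1, 0),
	BPF_EXIT_INSN(),
};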
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c0077adeea83..49e8facf7c4a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -924,4 +924,22 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
 
 #endif /* !CONFIG_CGROUPS */
 
+#ifdef CONFIG_CGROUP_BPF
+static inline void cgroup_bpf_get(struct cgroup *cgrp)
+{
+	percpu_ref_get(&cgrp->bpf.refcnt);
+}
+
+static inline void cgroup_bpf_put(struct cgroup *cgrp)
+{
+	percpu_ref_put(&cgrp->bpf.refcnt);
+}
+
+#else /* CONFIG_CGROUP_BPF */
+
+static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+
+#endif /* CONFIG_CGROUP_BPF */
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7148bab96943..43b45d6db36d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -160,6 +160,20 @@ struct ctl_table_header;
 		.off   = 0, \
 		.imm   = IMM })
 
+/* Special form of mov32, used for doing explicit zero extension on dst. */
+#define BPF_ZEXT_REG(DST) \
+	((struct bpf_insn) { \
+		.code  = BPF_ALU | BPF_MOV | BPF_X, \
+		.dst_reg = DST, \
+		.src_reg = DST, \
+		.off   = 0, \
+		.imm   = 1 })
+
+static inline bool insn_is_zext(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1;
+}
+
 /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
 #define BPF_LD_IMM64(DST, IMM) \
 	BPF_LD_IMM64_RAW(DST, 0, IMM)
@@ -512,7 +526,8 @@ struct bpf_prog {
 				blinded:1, /* Was blinded */
 				is_func:1, /* program is a bpf function */
 				kprobe_override:1, /* Do we override a kprobe? */
-				has_callchain_buf:1; /* callchain buffer allocated? */
+				has_callchain_buf:1, /* callchain buffer allocated? */
+				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
 	enum bpf_prog_type type; /* Type of BPF program */
 	enum bpf_attach_type expected_attach_type; /* For some prog types */
 	u32 len; /* Number of filter blocks */
@@ -811,6 +826,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
+bool bpf_jit_needs_zext(void);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(void)
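BPF_ZEXT_REG and insn_is_zext() rely on a small encoding trick: a verifier-inserted zero extension is a 32-bit mov of a register onto itself with imm set to 1, while an ordinary BPF_MOV32_REG leaves imm at 0, so later passes can tell the two apart. bpf_jit_needs_zext() is the per-architecture opt-in for having the verifier insert these instructions; the default presumably returns false, so JITs that already zero-extend every 32-bit result on their own are left unchanged. A short sketch of the encoding (the demo function is illustrative only):

/* A patched zero extension and a plain mov32 share the same opcode and
 * registers and differ only in the imm field checked by insn_is_zext(). */
static bool __maybe_unused zext_encoding_demo(void)
{
	struct bpf_insn zext = BPF_ZEXT_REG(BPF_REG_3);
	struct bpf_insn mov  = BPF_MOV32_REG(BPF_REG_3, BPF_REG_3);

	return insn_is_zext(&zext) && !insn_is_zext(&mov); /* evaluates to true */
}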