path: root/net/core/flow.c
Diffstat (limited to 'net/core/flow.c')
-rw-r--r--	net/core/flow.c	408
1 files changed, 234 insertions, 174 deletions
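The hunks below replace the old (object, object_ref) pair in each cache entry with a pointer to a struct flow_cache_object managed through an ops table (ops->get, ops->check, ops->delete), and they change the flow_resolve_t callback to return such an object. Those declarations are not part of this file's diff; the sketch below is only inferred from the calls made in the hunks (the real definitions presumably live in include/net/flow.h in the same patch set), so treat it as an approximation rather than the patch's actual header change:

/*
 * Sketch of the companion interface assumed by the calls in this diff.
 * Inferred from usage (ops->get/check/delete, resolver signature); not
 * copied from the patch itself.
 */
struct flow_cache_object;

struct flow_cache_ops {
	struct flow_cache_object *(*get)(struct flow_cache_object *);
	int (*check)(struct flow_cache_object *);
	void (*delete)(struct flow_cache_object *);
};

struct flow_cache_object {
	const struct flow_cache_ops *ops;
};

typedef struct flow_cache_object *(*flow_resolve_t)(
		struct net *net, struct flowi *key, u16 family,
		u8 dir, struct flow_cache_object *oldobj, void *ctx);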
diff --git a/net/core/flow.c b/net/core/flow.c
index 96015871ecea..f67dcbfe54ef 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,113 +26,159 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	union {
+		struct hlist_node	hlist;
+		struct list_head	gc_list;
+	} u;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_object	*object;
 };
 
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-
-static u32 flow_hash_shift;
-#define flow_hash_size	(1 << flow_hash_shift)
-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
-
-#define flow_table(cpu) (per_cpu(flow_tables, cpu))
-
-static struct kmem_cache *flow_cachep __read_mostly;
-
-static int flow_lwm, flow_hwm;
-
-struct flow_percpu_info {
-	int hash_rnd_recalc;
-	u32 hash_rnd;
-	int count;
+struct flow_cache_percpu {
+	struct hlist_head		*hash_table;
+	int				hash_count;
+	u32				hash_rnd;
+	int				hash_rnd_recalc;
+	struct tasklet_struct		flush_tasklet;
 };
-static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
 
-#define flow_hash_rnd_recalc(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
-#define flow_hash_rnd(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd)
-#define flow_count(cpu) \
-	(per_cpu(flow_hash_info, cpu).count)
+struct flow_flush_info {
+	struct flow_cache		*cache;
+	atomic_t			cpuleft;
+	struct completion		completion;
+};
 
-static struct timer_list flow_hash_rnd_timer;
+struct flow_cache {
+	u32				hash_shift;
+	unsigned long			order;
+	struct flow_cache_percpu	*percpu;
+	struct notifier_block		hotcpu_notifier;
+	int				low_watermark;
+	int				high_watermark;
+	struct timer_list		rnd_timer;
+};
 
-#define FLOW_HASH_RND_PERIOD	(10 * 60 * HZ)
+atomic_t flow_cache_genid = ATOMIC_INIT(0);
+EXPORT_SYMBOL(flow_cache_genid);
+static struct flow_cache flow_cache_global;
+static struct kmem_cache *flow_cachep;
 
-struct flow_flush_info {
-	atomic_t cpuleft;
-	struct completion completion;
-};
-static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
+static DEFINE_SPINLOCK(flow_cache_gc_lock);
+static LIST_HEAD(flow_cache_gc_list);
 
-#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
+#define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
+#define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
 static void flow_cache_new_hashrnd(unsigned long arg)
 {
+	struct flow_cache *fc = (void *) arg;
 	int i;
 
 	for_each_possible_cpu(i)
-		flow_hash_rnd_recalc(i) = 1;
+		per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
+
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
+}
 
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->object && !fle->object->ops->check(fle->object))
+		return 0;
+	return 1;
 }
 
-static void flow_entry_kill(int cpu, struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle)
 {
 	if (fle->object)
-		atomic_dec(fle->object_ref);
+		fle->object->ops->delete(fle->object);
 	kmem_cache_free(flow_cachep, fle);
-	flow_count(cpu)--;
 }
 
-static void __flow_cache_shrink(int cpu, int shrink_to)
+static void flow_cache_gc_task(struct work_struct *work)
 {
-	struct flow_cache_entry *fle, **flp;
-	int i;
+	struct list_head gc_list;
+	struct flow_cache_entry *fce, *n;
 
-	for (i = 0; i < flow_hash_size; i++) {
-		int k = 0;
+	INIT_LIST_HEAD(&gc_list);
+	spin_lock_bh(&flow_cache_gc_lock);
+	list_splice_tail_init(&flow_cache_gc_list, &gc_list);
+	spin_unlock_bh(&flow_cache_gc_lock);
 
-		flp = &flow_table(cpu)[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
-		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(cpu, fle);
-		}
+	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+		flow_entry_kill(fce);
+}
+static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
+
+static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
+				     int deleted, struct list_head *gc_list)
+{
+	if (deleted) {
+		fcp->hash_count -= deleted;
+		spin_lock_bh(&flow_cache_gc_lock);
+		list_splice_tail(gc_list, &flow_cache_gc_list);
+		spin_unlock_bh(&flow_cache_gc_lock);
+		schedule_work(&flow_cache_gc_work);
 	}
 }
 
-static void flow_cache_shrink(int cpu)
+static void __flow_cache_shrink(struct flow_cache *fc,
+				struct flow_cache_percpu *fcp,
+				int shrink_to)
 {
-	int shrink_to = flow_lwm / flow_hash_size;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
+
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
+		int saved = 0;
+
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+			} else {
+				deleted++;
+				hlist_del(&fle->u.hlist);
+				list_add_tail(&fle->u.gc_list, &gc_list);
+			}
+		}
+	}
 
-	__flow_cache_shrink(cpu, shrink_to);
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
 }
 
-static void flow_new_hash_rnd(int cpu)
+static void flow_cache_shrink(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
 {
-	get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
-	flow_hash_rnd_recalc(cpu) = 0;
+	int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
 
-	__flow_cache_shrink(cpu, 0);
+	__flow_cache_shrink(fc, fcp, shrink_to);
 }
 
-static u32 flow_hash_code(struct flowi *key, int cpu)
+static void flow_new_hash_rnd(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
+{
+	get_random_bytes(&fcp->hash_rnd, sizeof(u32));
+	fcp->hash_rnd_recalc = 0;
+	__flow_cache_shrink(fc, fcp, 0);
+}
+
+static u32 flow_hash_code(struct flow_cache *fc,
+			  struct flow_cache_percpu *fcp,
+			  struct flowi *key)
 {
 	u32 *k = (u32 *) key;
 
-	return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
-		(flow_hash_size - 1));
+	return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
+		& (flow_cache_hash_size(fc) - 1));
 }
 
 #if (BITS_PER_LONG == 64)
@@ -165,114 +211,118 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
+		  flow_resolve_t resolver, void *ctx)
 {
-	struct flow_cache_entry *fle, **head;
+	struct flow_cache *fc = &flow_cache_global;
+	struct flow_cache_percpu *fcp;
+	struct flow_cache_entry *fle, *tfle;
+	struct hlist_node *entry;
+	struct flow_cache_object *flo;
 	unsigned int hash;
-	int cpu;
 
 	local_bh_disable();
-	cpu = smp_processor_id();
+	fcp = this_cpu_ptr(fc->percpu);
 
 	fle = NULL;
+	flo = NULL;
 	/* Packet really early in init? Making flow_cache_init a
 	 * pre-smp initcall would solve this. --RR */
-	if (!flow_table(cpu))
+	if (!fcp->hash_table)
 		goto nocache;
 
-	if (flow_hash_rnd_recalc(cpu))
-		flow_new_hash_rnd(cpu);
-	hash = flow_hash_code(key, cpu);
+	if (fcp->hash_rnd_recalc)
+		flow_new_hash_rnd(fc, fcp);
 
-	head = &flow_table(cpu)[hash];
-	for (fle = *head; fle; fle = fle->next) {
-		if (fle->family == family &&
-		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+	hash = flow_hash_code(fc, fcp, key);
+	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
+		if (tfle->family == family &&
+		    tfle->dir == dir &&
+		    flow_key_compare(key, &tfle->key) == 0) {
+			fle = tfle;
 			break;
 		}
 	}
 
-	if (!fle) {
-		if (flow_count(cpu) > flow_hwm)
-			flow_cache_shrink(cpu);
+	if (unlikely(!fle)) {
+		if (fcp->hash_count > fc->high_watermark)
+			flow_cache_shrink(fc, fcp);
 
 		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
 		if (fle) {
-			fle->next = *head;
-			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->object = NULL;
-			flow_count(cpu)++;
+			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
+			fcp->hash_count++;
 		}
+	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+		flo = fle->object;
+		if (!flo)
+			goto ret_object;
+		flo = flo->ops->get(flo);
+		if (flo)
+			goto ret_object;
+	} else if (fle->object) {
+		flo = fle->object;
+		flo->ops->delete(flo);
+		fle->object = NULL;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
-		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	flo = NULL;
+	if (fle) {
+		flo = fle->object;
+		fle->object = NULL;
 	}
+	flo = resolver(net, key, family, dir, flo, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (!IS_ERR(flo))
+			fle->object = flo;
+		else
+			fle->genid--;
+	} else {
+		if (flo && !IS_ERR(flo))
+			flo->ops->delete(flo);
+	}
+ret_object:
+	local_bh_enable();
+	return flo;
 }
+EXPORT_SYMBOL(flow_cache_lookup);
 
 static void flow_cache_flush_tasklet(unsigned long data)
 {
 	struct flow_flush_info *info = (void *)data;
-	int i;
-	int cpu;
-
-	cpu = smp_processor_id();
-	for (i = 0; i < flow_hash_size; i++) {
-		struct flow_cache_entry *fle;
+	struct flow_cache *fc = info->cache;
+	struct flow_cache_percpu *fcp;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
 
-		fle = flow_table(cpu)[i];
-		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+	fcp = this_cpu_ptr(fc->percpu);
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
+			if (flow_entry_valid(fle))
 				continue;
 
-			fle->object = NULL;
-			atomic_dec(fle->object_ref);
+			deleted++;
+			hlist_del(&fle->u.hlist);
+			list_add_tail(&fle->u.gc_list, &gc_list);
 		}
 	}
 
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
 }
 
-static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
 static void flow_cache_flush_per_cpu(void *data)
 {
 	struct flow_flush_info *info = data;
@@ -280,8 +330,7 @@ static void flow_cache_flush_per_cpu(void *data)
 	struct tasklet_struct *tasklet;
 
 	cpu = smp_processor_id();
-
-	tasklet = flow_flush_tasklet(cpu);
+	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
 	tasklet->data = (unsigned long)info;
 	tasklet_schedule(tasklet);
 }
@@ -294,6 +343,7 @@ void flow_cache_flush(void)
 	/* Don't want cpus going down or up during this. */
 	get_online_cpus();
 	mutex_lock(&flow_flush_sem);
+	info.cache = &flow_cache_global;
 	atomic_set(&info.cpuleft, num_online_cpus());
 	init_completion(&info.completion);
 
@@ -307,62 +357,72 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }
 
-static void __init flow_cache_cpu_prepare(int cpu)
+static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
+					  struct flow_cache_percpu *fcp)
 {
-	struct tasklet_struct *tasklet;
-	unsigned long order;
-
-	for (order = 0;
-	     (PAGE_SIZE << order) <
-	     (sizeof(struct flow_cache_entry *)*flow_hash_size);
-	     order++)
-		/* NOTHING */;
-
-	flow_table(cpu) = (struct flow_cache_entry **)
-		__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
-	if (!flow_table(cpu))
-		panic("NET: failed to allocate flow cache order %lu\n", order);
-
-	flow_hash_rnd_recalc(cpu) = 1;
-	flow_count(cpu) = 0;
-
-	tasklet = flow_flush_tasklet(cpu);
-	tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
+	fcp->hash_table = (struct hlist_head *)
+		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
+	if (!fcp->hash_table)
+		panic("NET: failed to allocate flow cache order %lu\n", fc->order);
+
+	fcp->hash_rnd_recalc = 1;
+	fcp->hash_count = 0;
+	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
 }
 
 static int flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
 {
+	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+	int cpu = (unsigned long) hcpu;
+	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
-		__flow_cache_shrink((unsigned long)hcpu, 0);
+		__flow_cache_shrink(fc, fcp, 0);
 	return NOTIFY_OK;
 }
 
-static int __init flow_cache_init(void)
+static int flow_cache_init(struct flow_cache *fc)
 {
+	unsigned long order;
 	int i;
 
-	flow_cachep = kmem_cache_create("flow_cache",
-					sizeof(struct flow_cache_entry),
-					0, SLAB_PANIC,
-					NULL);
-	flow_hash_shift = 10;
-	flow_lwm = 2 * flow_hash_size;
-	flow_hwm = 4 * flow_hash_size;
+	fc->hash_shift = 10;
+	fc->low_watermark = 2 * flow_cache_hash_size(fc);
+	fc->high_watermark = 4 * flow_cache_hash_size(fc);
 
-	setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0);
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+	for (order = 0;
+	     (PAGE_SIZE << order) <
+	     (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
+	     order++)
+		/* NOTHING */;
+	fc->order = order;
+	fc->percpu = alloc_percpu(struct flow_cache_percpu);
+
+	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
+		    (unsigned long) fc);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
 
 	for_each_possible_cpu(i)
-		flow_cache_cpu_prepare(i);
+		flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
+
+	fc->hotcpu_notifier = (struct notifier_block){
+		.notifier_call = flow_cache_cpu,
+	};
+	register_hotcpu_notifier(&fc->hotcpu_notifier);
 
-	hotcpu_notifier(flow_cache_cpu, 0);
 	return 0;
 }
 
-module_init(flow_cache_init);
+static int __init flow_cache_init_global(void)
+{
+	flow_cachep = kmem_cache_create("flow_cache",
+					sizeof(struct flow_cache_entry),
+					0, SLAB_PANIC, NULL);
 
-EXPORT_SYMBOL(flow_cache_genid);
-EXPORT_SYMBOL(flow_cache_lookup);
+	return flow_cache_init(&flow_cache_global);
+}
+
+module_init(flow_cache_init_global);
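
For orientation, this is roughly how a user of the reworked flow_cache_lookup() could embed a flow_cache_object in its own cached object and supply the ops and resolver the new code expects. All names here (my_flow_obj, my_resolver, my_flow_ops, and so on) are hypothetical and invented for illustration; they do not appear in the patch, and the reference-counting scheme is only one plausible arrangement consistent with the get/delete calls visible in the diff above.

/* Hypothetical consumer of the new interface; not part of the patch. */
#include <linux/err.h>
#include <linux/slab.h>
#include <net/flow.h>

struct my_flow_obj {
	struct flow_cache_object flo;	/* embedded cache handle */
	atomic_t refcnt;
	int dead;
};

static struct flow_cache_object *my_flow_get(struct flow_cache_object *flo)
{
	struct my_flow_obj *obj = container_of(flo, struct my_flow_obj, flo);

	atomic_inc(&obj->refcnt);	/* reference handed to the caller */
	return flo;
}

static int my_flow_check(struct flow_cache_object *flo)
{
	struct my_flow_obj *obj = container_of(flo, struct my_flow_obj, flo);

	return !obj->dead;		/* 0 => entry is stale, cache discards it */
}

static void my_flow_delete(struct flow_cache_object *flo)
{
	struct my_flow_obj *obj = container_of(flo, struct my_flow_obj, flo);

	if (atomic_dec_and_test(&obj->refcnt))
		kfree(obj);
}

static const struct flow_cache_ops my_flow_ops = {
	.get	= my_flow_get,
	.check	= my_flow_check,
	.delete	= my_flow_delete,
};

static struct flow_cache_object *
my_resolver(struct net *net, struct flowi *key, u16 family, u8 dir,
	    struct flow_cache_object *old, void *ctx)
{
	struct my_flow_obj *obj;

	if (old)
		my_flow_delete(old);	/* stale object handed back by the cache */

	obj = kzalloc(sizeof(*obj), GFP_ATOMIC);
	if (!obj)
		return ERR_PTR(-ENOMEM);

	obj->flo.ops = &my_flow_ops;
	/* one reference kept by the cache entry, one returned to the caller */
	atomic_set(&obj->refcnt, 2);
	return &obj->flo;
}

A lookup would then be issued as flo = flow_cache_lookup(net, key, family, dir, my_resolver, NULL), with the caller dropping its reference through flo->ops->delete() when done.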