Diffstat (limited to 'net/core/flow.c')
 net/core/flow.c | 405 +++++++++++++++++++++++++++--------------------
 1 file changed, 233 insertions(+), 172 deletions(-)
diff --git a/net/core/flow.c b/net/core/flow.c
index 96015871ece..16190067400 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,113 +26,158 @@
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	union {
+		struct hlist_node	hlist;
+		struct list_head	gc_list;
+	} u;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_object	*object;
 };
 
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-
-static u32 flow_hash_shift;
-#define flow_hash_size	(1 << flow_hash_shift)
-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
-
-#define flow_table(cpu) (per_cpu(flow_tables, cpu))
-
-static struct kmem_cache *flow_cachep __read_mostly;
+struct flow_cache_percpu {
+	struct hlist_head		*hash_table;
+	int				hash_count;
+	u32				hash_rnd;
+	int				hash_rnd_recalc;
+	struct tasklet_struct		flush_tasklet;
+};
 
-static int flow_lwm, flow_hwm;
+struct flow_flush_info {
+	struct flow_cache		*cache;
+	atomic_t			cpuleft;
+	struct completion		completion;
+};
 
-struct flow_percpu_info {
-	int hash_rnd_recalc;
-	u32 hash_rnd;
-	int count;
+struct flow_cache {
+	u32				hash_shift;
+	unsigned long			order;
+	struct flow_cache_percpu	*percpu;
+	struct notifier_block		hotcpu_notifier;
+	int				low_watermark;
+	int				high_watermark;
+	struct timer_list		rnd_timer;
 };
-static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
 
-#define flow_hash_rnd_recalc(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
-#define flow_hash_rnd(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd)
-#define flow_count(cpu) \
-	(per_cpu(flow_hash_info, cpu).count)
+atomic_t flow_cache_genid = ATOMIC_INIT(0);
+static struct flow_cache flow_cache_global;
+static struct kmem_cache *flow_cachep;
 
-static struct timer_list flow_hash_rnd_timer;
+static DEFINE_SPINLOCK(flow_cache_gc_lock);
+static LIST_HEAD(flow_cache_gc_list);
 
-#define FLOW_HASH_RND_PERIOD	(10 * 60 * HZ)
-
-struct flow_flush_info {
-	atomic_t cpuleft;
-	struct completion completion;
-};
-static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
-
-#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
+#define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
+#define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
 static void flow_cache_new_hashrnd(unsigned long arg)
 {
+	struct flow_cache *fc = (void *) arg;
 	int i;
 
 	for_each_possible_cpu(i)
-		flow_hash_rnd_recalc(i) = 1;
+		per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
 
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
+}
+
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->object && !fle->object->ops->check(fle->object))
+		return 0;
+	return 1;
 }
 
-static void flow_entry_kill(int cpu, struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle)
 {
 	if (fle->object)
-		atomic_dec(fle->object_ref);
+		fle->object->ops->delete(fle->object);
 	kmem_cache_free(flow_cachep, fle);
-	flow_count(cpu)--;
 }
 
-static void __flow_cache_shrink(int cpu, int shrink_to)
+static void flow_cache_gc_task(struct work_struct *work)
 {
-	struct flow_cache_entry *fle, **flp;
-	int i;
+	struct list_head gc_list;
+	struct flow_cache_entry *fce, *n;
 
-	for (i = 0; i < flow_hash_size; i++) {
-		int k = 0;
+	INIT_LIST_HEAD(&gc_list);
+	spin_lock_bh(&flow_cache_gc_lock);
+	list_splice_tail_init(&flow_cache_gc_list, &gc_list);
+	spin_unlock_bh(&flow_cache_gc_lock);
 
-		flp = &flow_table(cpu)[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
-		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(cpu, fle);
-		}
+	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+		flow_entry_kill(fce);
+}
+static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
+
+static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
+				     int deleted, struct list_head *gc_list)
+{
+	if (deleted) {
+		fcp->hash_count -= deleted;
+		spin_lock_bh(&flow_cache_gc_lock);
+		list_splice_tail(gc_list, &flow_cache_gc_list);
+		spin_unlock_bh(&flow_cache_gc_lock);
+		schedule_work(&flow_cache_gc_work);
 	}
 }
 
-static void flow_cache_shrink(int cpu)
+static void __flow_cache_shrink(struct flow_cache *fc,
+				struct flow_cache_percpu *fcp,
+				int shrink_to)
 {
-	int shrink_to = flow_lwm / flow_hash_size;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
+
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
+		int saved = 0;
+
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+			} else {
+				deleted++;
+				hlist_del(&fle->u.hlist);
+				list_add_tail(&fle->u.gc_list, &gc_list);
+			}
+		}
+	}
 
-	__flow_cache_shrink(cpu, shrink_to);
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
 }
 
-static void flow_new_hash_rnd(int cpu)
+static void flow_cache_shrink(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
 {
-	get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
-	flow_hash_rnd_recalc(cpu) = 0;
+	int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
 
-	__flow_cache_shrink(cpu, 0);
+	__flow_cache_shrink(fc, fcp, shrink_to);
 }
 
-static u32 flow_hash_code(struct flowi *key, int cpu)
+static void flow_new_hash_rnd(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
+{
+	get_random_bytes(&fcp->hash_rnd, sizeof(u32));
+	fcp->hash_rnd_recalc = 0;
+	__flow_cache_shrink(fc, fcp, 0);
+}
+
+static u32 flow_hash_code(struct flow_cache *fc,
+			  struct flow_cache_percpu *fcp,
+			  struct flowi *key)
 {
 	u32 *k = (u32 *) key;
 
-	return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
-		(flow_hash_size - 1));
+	return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
+		& (flow_cache_hash_size(fc) - 1));
 }
 
 #if (BITS_PER_LONG == 64)
@@ -165,114 +210,117 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
+		  flow_resolve_t resolver, void *ctx)
 {
-	struct flow_cache_entry *fle, **head;
+	struct flow_cache *fc = &flow_cache_global;
+	struct flow_cache_percpu *fcp;
+	struct flow_cache_entry *fle, *tfle;
+	struct hlist_node *entry;
+	struct flow_cache_object *flo;
 	unsigned int hash;
-	int cpu;
 
 	local_bh_disable();
-	cpu = smp_processor_id();
+	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
+	flo = NULL;
 	/* Packet really early in init? Making flow_cache_init a
 	 * pre-smp initcall would solve this. --RR */
-	if (!flow_table(cpu))
+	if (!fcp->hash_table)
 		goto nocache;
 
-	if (flow_hash_rnd_recalc(cpu))
-		flow_new_hash_rnd(cpu);
-	hash = flow_hash_code(key, cpu);
+	if (fcp->hash_rnd_recalc)
+		flow_new_hash_rnd(fc, fcp);
 
-	head = &flow_table(cpu)[hash];
-	for (fle = *head; fle; fle = fle->next) {
-		if (fle->family == family &&
-		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+	hash = flow_hash_code(fc, fcp, key);
+	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
+		if (tfle->family == family &&
+		    tfle->dir == dir &&
+		    flow_key_compare(key, &tfle->key) == 0) {
+			fle = tfle;
 			break;
 		}
 	}
 
-	if (!fle) {
-		if (flow_count(cpu) > flow_hwm)
-			flow_cache_shrink(cpu);
+	if (unlikely(!fle)) {
+		if (fcp->hash_count > fc->high_watermark)
+			flow_cache_shrink(fc, fcp);
 
 		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
 		if (fle) {
-			fle->next = *head;
-			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->object = NULL;
-			flow_count(cpu)++;
+			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
+			fcp->hash_count++;
 		}
+	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+		flo = fle->object;
+		if (!flo)
+			goto ret_object;
+		flo = flo->ops->get(flo);
+		if (flo)
+			goto ret_object;
+	} else if (fle->object) {
+		flo = fle->object;
+		flo->ops->delete(flo);
+		fle->object = NULL;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
-		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	flo = NULL;
+	if (fle) {
+		flo = fle->object;
+		fle->object = NULL;
 	}
+	flo = resolver(net, key, family, dir, flo, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (!IS_ERR(flo))
+			fle->object = flo;
+		else
+			fle->genid--;
+	} else {
+		if (flo && !IS_ERR(flo))
+			flo->ops->delete(flo);
+	}
+ret_object:
+	local_bh_enable();
+	return flo;
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
 {
 	struct flow_flush_info *info = (void *)data;
-	int i;
-	int cpu;
-
-	cpu = smp_processor_id();
-	for (i = 0; i < flow_hash_size; i++) {
-		struct flow_cache_entry *fle;
+	struct flow_cache *fc = info->cache;
+	struct flow_cache_percpu *fcp;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry, *tmp;
+	LIST_HEAD(gc_list);
+	int i, deleted = 0;
 
-		fle = flow_table(cpu)[i];
-		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
+		hlist_for_each_entry_safe(fle, entry, tmp,
+					  &fcp->hash_table[i], u.hlist) {
+			if (flow_entry_valid(fle))
 				continue;
 
-			fle->object = NULL;
-			atomic_dec(fle->object_ref);
+			deleted++;
+			hlist_del(&fle->u.hlist);
+			list_add_tail(&fle->u.gc_list, &gc_list);
 		}
 	}
 
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
 }
 
-static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
 static void flow_cache_flush_per_cpu(void *data)
 {
 	struct flow_flush_info *info = data;
@@ -280,8 +328,7 @@ static void flow_cache_flush_per_cpu(void *data)
 	struct tasklet_struct *tasklet;
 
 	cpu = smp_processor_id();
-
-	tasklet = flow_flush_tasklet(cpu);
+	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
 	tasklet->data = (unsigned long)info;
 	tasklet_schedule(tasklet);
 }
@@ -294,6 +341,7 @@ void flow_cache_flush(void)
 	/* Don't want cpus going down or up during this. */
 	get_online_cpus();
 	mutex_lock(&flow_flush_sem);
+	info.cache = &flow_cache_global;
 	atomic_set(&info.cpuleft, num_online_cpus());
 	init_completion(&info.completion);
 
@@ -307,62 +355,75 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }
 
-static void __init flow_cache_cpu_prepare(int cpu)
+static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
+					  struct flow_cache_percpu *fcp)
 {
-	struct tasklet_struct *tasklet;
-	unsigned long order;
-
-	for (order = 0;
-	     (PAGE_SIZE << order) <
-	     (sizeof(struct flow_cache_entry *)*flow_hash_size);
-	     order++)
-		/* NOTHING */;
-
-	flow_table(cpu) = (struct flow_cache_entry **)
-		__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
-	if (!flow_table(cpu))
-		panic("NET: failed to allocate flow cache order %lu\n", order);
-
-	flow_hash_rnd_recalc(cpu) = 1;
-	flow_count(cpu) = 0;
-
-	tasklet = flow_flush_tasklet(cpu);
-	tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
+	fcp->hash_table = (struct hlist_head *)
+		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
+	if (!fcp->hash_table)
+		panic("NET: failed to allocate flow cache order %lu\n", fc->order);
+
+	fcp->hash_rnd_recalc = 1;
+	fcp->hash_count = 0;
+	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
 }
 
 static int flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
 {
+	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+	int cpu = (unsigned long) hcpu;
+	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
-		__flow_cache_shrink((unsigned long)hcpu, 0);
+		__flow_cache_shrink(fc, fcp, 0);
 	return NOTIFY_OK;
 }
 
-static int __init flow_cache_init(void)
+static int flow_cache_init(struct flow_cache *fc)
 {
+	unsigned long order;
 	int i;
 
-	flow_cachep = kmem_cache_create("flow_cache",
-					sizeof(struct flow_cache_entry),
-					0, SLAB_PANIC,
-					NULL);
-	flow_hash_shift = 10;
-	flow_lwm = 2 * flow_hash_size;
-	flow_hwm = 4 * flow_hash_size;
+	fc->hash_shift = 10;
+	fc->low_watermark = 2 * flow_cache_hash_size(fc);
+	fc->high_watermark = 4 * flow_cache_hash_size(fc);
+
+	for (order = 0;
+	     (PAGE_SIZE << order) <
+	     (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
+	     order++)
+		/* NOTHING */;
+	fc->order = order;
+	fc->percpu = alloc_percpu(struct flow_cache_percpu);
 
-	setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0);
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
+		    (unsigned long) fc);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
 
 	for_each_possible_cpu(i)
-		flow_cache_cpu_prepare(i);
+		flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
+
+	fc->hotcpu_notifier = (struct notifier_block){
+		.notifier_call = flow_cache_cpu,
+	};
+	register_hotcpu_notifier(&fc->hotcpu_notifier);
 
-	hotcpu_notifier(flow_cache_cpu, 0);
 	return 0;
 }
 
-module_init(flow_cache_init);
+static int __init flow_cache_init_global(void)
+{
+	flow_cachep = kmem_cache_create("flow_cache",
+					sizeof(struct flow_cache_entry),
+					0, SLAB_PANIC, NULL);
+
+	return flow_cache_init(&flow_cache_global);
+}
+
+module_init(flow_cache_init_global);
 
 EXPORT_SYMBOL(flow_cache_genid);
 EXPORT_SYMBOL(flow_cache_lookup);
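
The lookup path above leans on three virtual methods on cached objects: ops->get() when a fresh entry is hit, ops->check() via flow_entry_valid() on lookup, shrink and flush, and ops->delete() when an entry is killed or a stale object is replaced. Their declarations live in include/net/flow.h and are not part of this diff; the sketch below reconstructs the contract from the call sites in this file, so treat the exact layout and the flow_resolve_t signature as assumptions inferred from usage rather than the authoritative header.

/* Reconstructed from the call sites above; the real declarations are in
 * include/net/flow.h and may differ in detail.
 */
struct flow_cache_object;

struct flow_cache_ops {
	/* Take a reference for the caller; returning NULL makes
	 * flow_cache_lookup() fall through to a fresh resolve. */
	struct flow_cache_object *(*get)(struct flow_cache_object *flo);
	/* Non-zero while the cached object is still usable; consulted
	 * by flow_entry_valid(). */
	int (*check)(struct flow_cache_object *flo);
	/* Drop the cache's reference; called from flow_entry_kill()
	 * and when a stale object is replaced. */
	void (*delete)(struct flow_cache_object *flo);
};

struct flow_cache_object {
	const struct flow_cache_ops *ops;
};

/* Matches the call "flo = resolver(net, key, family, dir, flo, ctx)":
 * the old object (or NULL) goes in, and the new object or an ERR_PTR()
 * comes back.  Note that on error the lookup path decrements fle->genid
 * so the entry is treated as stale and resolved again next time. */
typedef struct flow_cache_object *(*flow_resolve_t)(
	struct net *net, struct flowi *key, u16 family, u8 dir,
	struct flow_cache_object *oldflo, void *ctx);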
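For a consumer of this API, the pattern implied by the diff is to embed struct flow_cache_object inside the resolver's own object and recover the container in the ops. The fragment below is a hypothetical illustration only: the example_* names are invented and not part of this commit (the real users are the xfrm policy/bundle objects elsewhere in this series).

struct example_entry {
	atomic_t			refcnt;
	struct flow_cache_object	flo;	/* handle stored in the flow cache */
};

static struct flow_cache_object *example_get(struct flow_cache_object *flo)
{
	struct example_entry *e = container_of(flo, struct example_entry, flo);

	atomic_inc(&e->refcnt);		/* reference handed to the caller */
	return flo;
}

static int example_check(struct flow_cache_object *flo)
{
	return 1;	/* entries never go stale in this sketch */
}

static void example_delete(struct flow_cache_object *flo)
{
	struct example_entry *e = container_of(flo, struct example_entry, flo);

	if (atomic_dec_and_test(&e->refcnt))
		kfree(e);
}

static const struct flow_cache_ops example_ops = {
	.get	= example_get,
	.check	= example_check,
	.delete	= example_delete,
};

Because deletion of evicted entries is deferred to flow_cache_gc_task() in process context, ops->delete() may sleep in this scheme, while ops->get() and ops->check() run under local_bh_disable() in the lookup path and must not.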