aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFlorian Westphal <fw@strlen.de>2016-11-04 11:54:58 -0400
committerPablo Neira Ayuso <pablo@netfilter.org>2016-11-08 17:53:38 -0500
commite0df8cae6c16b9ba66a005079aa754b9eedc6efa (patch)
tree50ef249706b3daf0d356b643ad2644b9c92cb59a
parent6114cc516dcc0d311badb83ad7db5aa4b611bea6 (diff)
netfilter: conntrack: refine gc worker heuristics
Nicolas Dichtel says: After commit b87a2f9199ea ("netfilter: conntrack: add gc worker to remove timed-out entries"), netlink conntrack deletion events may be sent with a huge delay. Nicolas further points at this line: goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS); and indeed, this isn't optimal at all. Rationale here was to ensure that we don't block other work items for too long, even if nf_conntrack_htable_size is huge. But in order to have some guarantee about maximum time period where a scan of the full conntrack table completes we should always use a fixed slice size, so that once every N scans the full table has been examined at least once. We also need to balance this vs. the case where the system is either idle (i.e., conntrack table (almost) empty) or very busy (i.e. eviction happens from packet path). So, after some discussion with Nicolas: 1. want hard guarantee that we scan entire table at least once every X s -> need to scan fraction of table (get rid of upper bound) 2. don't want to eat cycles on idle or very busy system -> increase interval if we did not evict any entries 3. don't want to block other worker items for too long -> make fraction really small, and prefer small scan interval instead 4. Want reasonable short time where we detect timed-out entry when system went idle after a burst of traffic, while not doing scans all the time. -> Store next gc scan in worker, increasing delays when no eviction happened and shrinking delay when we see timed out entries. The old gc interval is turned into a max number, scans can now happen every jiffy if stale entries are present. Longest possible time period until an entry is evicted is now 2 minutes in worst case (entry expires right after it was deemed 'not expired'). Reported-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: Florian Westphal <fw@strlen.de> Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
-rw-r--r--net/netfilter/nf_conntrack_core.c49
1 files changed, 41 insertions, 8 deletions
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index df2f5a3901df..0f87e5d21be7 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -76,6 +76,7 @@ struct conntrack_gc_work {
76 struct delayed_work dwork; 76 struct delayed_work dwork;
77 u32 last_bucket; 77 u32 last_bucket;
78 bool exiting; 78 bool exiting;
79 long next_gc_run;
79}; 80};
80 81
81static __read_mostly struct kmem_cache *nf_conntrack_cachep; 82static __read_mostly struct kmem_cache *nf_conntrack_cachep;
@@ -83,9 +84,11 @@ static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
83static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); 84static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
84static __read_mostly bool nf_conntrack_locks_all; 85static __read_mostly bool nf_conntrack_locks_all;
85 86
87/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
86#define GC_MAX_BUCKETS_DIV 64u 88#define GC_MAX_BUCKETS_DIV 64u
87#define GC_MAX_BUCKETS 8192u 89/* upper bound of scan intervals */
88#define GC_INTERVAL (5 * HZ) 90#define GC_INTERVAL_MAX (2 * HZ)
91/* maximum conntracks to evict per gc run */
89#define GC_MAX_EVICTS 256u 92#define GC_MAX_EVICTS 256u
90 93
91static struct conntrack_gc_work conntrack_gc_work; 94static struct conntrack_gc_work conntrack_gc_work;
@@ -936,13 +939,13 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
936static void gc_worker(struct work_struct *work) 939static void gc_worker(struct work_struct *work)
937{ 940{
938 unsigned int i, goal, buckets = 0, expired_count = 0; 941 unsigned int i, goal, buckets = 0, expired_count = 0;
939 unsigned long next_run = GC_INTERVAL;
940 unsigned int ratio, scanned = 0;
941 struct conntrack_gc_work *gc_work; 942 struct conntrack_gc_work *gc_work;
943 unsigned int ratio, scanned = 0;
944 unsigned long next_run;
942 945
943 gc_work = container_of(work, struct conntrack_gc_work, dwork.work); 946 gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
944 947
945 goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS); 948 goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
946 i = gc_work->last_bucket; 949 i = gc_work->last_bucket;
947 950
948 do { 951 do {
@@ -982,17 +985,47 @@ static void gc_worker(struct work_struct *work)
982 if (gc_work->exiting) 985 if (gc_work->exiting)
983 return; 986 return;
984 987
988 /*
989 * Eviction will normally happen from the packet path, and not
990 * from this gc worker.
991 *
992 * This worker is only here to reap expired entries when system went
993 * idle after a busy period.
994 *
995 * The heuristics below are supposed to balance conflicting goals:
996 *
997 * 1. Minimize time until we notice a stale entry
998 * 2. Maximize scan intervals to not waste cycles
999 *
1000 * Normally, expired_count will be 0, this increases the next_run time
1001 * to priorize 2) above.
1002 *
1003 * As soon as a timed-out entry is found, move towards 1) and increase
1004 * the scan frequency.
1005 * In case we have lots of evictions next scan is done immediately.
1006 */
985 ratio = scanned ? expired_count * 100 / scanned : 0; 1007 ratio = scanned ? expired_count * 100 / scanned : 0;
986 if (ratio >= 90 || expired_count == GC_MAX_EVICTS) 1008 if (ratio >= 90 || expired_count == GC_MAX_EVICTS) {
1009 gc_work->next_gc_run = 0;
987 next_run = 0; 1010 next_run = 0;
1011 } else if (expired_count) {
1012 gc_work->next_gc_run /= 2U;
1013 next_run = msecs_to_jiffies(1);
1014 } else {
1015 if (gc_work->next_gc_run < GC_INTERVAL_MAX)
1016 gc_work->next_gc_run += msecs_to_jiffies(1);
1017
1018 next_run = gc_work->next_gc_run;
1019 }
988 1020
989 gc_work->last_bucket = i; 1021 gc_work->last_bucket = i;
990 schedule_delayed_work(&gc_work->dwork, next_run); 1022 queue_delayed_work(system_long_wq, &gc_work->dwork, next_run);
991} 1023}
992 1024
993static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) 1025static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
994{ 1026{
995 INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); 1027 INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
1028 gc_work->next_gc_run = GC_INTERVAL_MAX;
996 gc_work->exiting = false; 1029 gc_work->exiting = false;
997} 1030}
998 1031
@@ -1885,7 +1918,7 @@ int nf_conntrack_init_start(void)
1885 nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); 1918 nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
1886 1919
1887 conntrack_gc_work_init(&conntrack_gc_work); 1920 conntrack_gc_work_init(&conntrack_gc_work);
1888 schedule_delayed_work(&conntrack_gc_work.dwork, GC_INTERVAL); 1921 queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, GC_INTERVAL_MAX);
1889 1922
1890 return 0; 1923 return 0;
1891 1924