diff options
author | Eric Dumazet <edumazet@google.com> | 2014-09-26 02:04:56 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-09-29 00:04:55 -0400 |
commit | 3d9a0d2f8212879407e58d67f460d8920eb6543d (patch) | |
tree | 2f21d1f2173c017fabddec9009de8aba855b7c22 | |
parent | 68f6a7c6c9817f2e6a66b59893de3c901ae5608c (diff) |
dql: dql_queued() should write first to reduce bus transactions
While doing a high-throughput test on a BQL-enabled NIC,
I found a very high cost in ndo_start_xmit() when accessing BQL data.
It turned out the problem was caused by the compiler trying to be
smart, which resulted in a bad MESI transaction:
0.05 │ mov 0xc0(%rax),%edi // LOAD dql->num_queued
0.48 │ mov %edx,0xc8(%rax) // STORE dql->last_obj_cnt = count
58.23 │ add %edx,%edi
0.58 │ cmp %edi,0xc4(%rax)
0.76 │ mov %edi,0xc0(%rax) // STORE dql->num_queued += count
0.72 │ js bd8
I got an incredible 10% gain [1] by making sure the cpu does not attempt
to get the cache line in Shared mode, but directly issues a request for
ownership.
New code :
mov %edx,0xc8(%rax) // STORE dql->last_obj_cnt = count
add %edx,0xc0(%rax) // RMW dql->num_queued += count
mov 0xc4(%rax),%ecx // LOAD dql->adj_limit
mov 0xc0(%rax),%edx // LOAD dql->num_queued
cmp %edx,%ecx
The TX completion was running on another cpu, with a high interrupt
rate.
Note that I am using barrier() as a soft hint, as mb() here could be
too costly.
[1] This was a netperf TCP_STREAM with TSO disabled, but GSO enabled.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/dynamic_queue_limits.h | 12 |
1 files changed, 10 insertions, 2 deletions
diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index 5621547d631b..a4be70398ce1 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h | |||
@@ -73,14 +73,22 @@ static inline void dql_queued(struct dql *dql, unsigned int count) | |||
73 | { | 73 | { |
74 | BUG_ON(count > DQL_MAX_OBJECT); | 74 | BUG_ON(count > DQL_MAX_OBJECT); |
75 | 75 | ||
76 | dql->num_queued += count; | ||
77 | dql->last_obj_cnt = count; | 76 | dql->last_obj_cnt = count; |
77 | |||
78 | /* We want to force a write first, so that cpu do not attempt | ||
79 | * to get cache line containing last_obj_cnt, num_queued, adj_limit | ||
80 | * in Shared state, but directly does a Request For Ownership | ||
81 | * It is only a hint, we use barrier() only. | ||
82 | */ | ||
83 | barrier(); | ||
84 | |||
85 | dql->num_queued += count; | ||
78 | } | 86 | } |
79 | 87 | ||
80 | /* Returns how many objects can be queued, < 0 indicates over limit. */ | 88 | /* Returns how many objects can be queued, < 0 indicates over limit. */ |
81 | static inline int dql_avail(const struct dql *dql) | 89 | static inline int dql_avail(const struct dql *dql) |
82 | { | 90 | { |
83 | return dql->adj_limit - dql->num_queued; | 91 | return ACCESS_ONCE(dql->adj_limit) - ACCESS_ONCE(dql->num_queued); |
84 | } | 92 | } |
85 | 93 | ||
86 | /* Record number of completed objects and recalculate the limit. */ | 94 | /* Record number of completed objects and recalculate the limit. */ |