author     Linus Torvalds <torvalds@linux-foundation.org>   2010-10-23 14:47:02 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2010-10-23 14:47:02 -0400
commit     5f05647dd81c11a6a165ccc8f0c1370b16f3bcb0 (patch)
tree       7851ef1c93aa1aba7ef327ca4b75fd35e6d10f29 /net/rds/ib_recv.c
parent     02f36038c568111ad4fc433f6fa760ff5e38fab4 (diff)
parent     ec37a48d1d16c30b655ac5280209edf52a6775d4 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1699 commits)
bnx2/bnx2x: Unsupported Ethtool operations should return -EINVAL.
vlan: Calling vlan_hwaccel_do_receive() is always valid.
tproxy: use the interface primary IP address as a default value for --on-ip
tproxy: added IPv6 support to the socket match
cxgb3: function namespace cleanup
tproxy: added IPv6 support to the TPROXY target
tproxy: added IPv6 socket lookup function to nf_tproxy_core
be2net: Changes to use only priority codes allowed by f/w
tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled
tproxy: added tproxy sockopt interface in the IPV6 layer
tproxy: added udp6_lib_lookup function
tproxy: added const specifiers to udp lookup functions
tproxy: split off ipv6 defragmentation to a separate module
l2tp: small cleanup
nf_nat: restrict ICMP translation for embedded header
can: mcp251x: fix generation of error frames
can: mcp251x: fix endless loop in interrupt handler if CANINTF_MERRF is set
can-raw: add msg_flags to distinguish local traffic
9p: client code cleanup
rds: make local functions/variables static
...
Fix up conflicts in net/core/dev.c, drivers/net/pcmcia/smc91c92_cs.c and
drivers/net/wireless/ath/ath9k/debug.c as per David
Diffstat (limited to 'net/rds/ib_recv.c')
-rw-r--r--   net/rds/ib_recv.c   549
1 file changed, 332 insertions(+), 217 deletions(-)
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index c74e9904a6b2..e29e0ca32f74 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
 
-static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	__free_page(frag->f_page);
-	frag->f_page = NULL;
-}
-
-static void rds_ib_frag_free(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page != NULL);
-	kmem_cache_free(rds_ib_frag_slab, frag);
-}
-
-/*
- * We map a page at a time.  Its fragments are posted in order.  This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
-				   struct rds_ib_recv_work *recv)
-{
-	struct rds_page_frag *frag = recv->r_frag;
-
-	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
-	if (frag->f_mapped)
-		ib_dma_unmap_page(ic->i_cm_id->device,
-				  frag->f_mapped,
-				  RDS_FRAG_SIZE, DMA_FROM_DEVICE);
-	frag->f_mapped = 0;
-}
-
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
 	struct rds_ib_recv_work *recv;
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 		recv->r_wr.sg_list = recv->r_sge;
 		recv->r_wr.num_sge = RDS_IB_RECV_SGE;
 
-		sge = rds_ib_data_sge(ic, recv->r_sge);
+		sge = &recv->r_sge[0];
+		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->length = sizeof(struct rds_header);
+		sge->lkey = ic->i_mr->lkey;
+
+		sge = &recv->r_sge[1];
 		sge->addr = 0;
 		sge->length = RDS_FRAG_SIZE;
 		sge->lkey = ic->i_mr->lkey;
+	}
+}
 
-		sge = rds_ib_header_sge(ic, recv->r_sge);
-		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
-		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+				    struct list_head *to)
+{
+	struct list_head *from_last = from->prev;
+
+	list_splice_tail(from_last, to);
+	list_add_tail(from_last, to);
+}
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *tmp;
+
+	tmp = xchg(&cache->xfer, NULL);
+	if (tmp) {
+		if (cache->ready)
+			list_splice_entire_tail(tmp, cache->ready);
+		else
+			cache->ready = tmp;
+	}
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+	if (!cache->percpu)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		head->first = NULL;
+		head->count = 0;
+	}
+	cache->xfer = NULL;
+	cache->ready = NULL;
+
+	return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+	int ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+	if (!ret) {
+		ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+		if (ret)
+			free_percpu(ic->i_cache_incs.percpu);
 	}
+
+	return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+					  struct list_head *caller_list)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		if (head->first) {
+			list_splice_entire_tail(head->first, caller_list);
+			head->first = NULL;
+		}
+	}
+
+	if (cache->ready) {
+		list_splice_entire_tail(cache->ready, caller_list);
+		cache->ready = NULL;
+	}
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+	struct rds_ib_incoming *inc;
+	struct rds_ib_incoming *inc_tmp;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *frag_tmp;
+	LIST_HEAD(list);
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+	free_percpu(ic->i_cache_incs.percpu);
+
+	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+		list_del(&inc->ii_cache_entry);
+		WARN_ON(!list_empty(&inc->ii_frags));
+		kmem_cache_free(rds_ib_incoming_slab, inc);
+	}
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+	free_percpu(ic->i_cache_frags.percpu);
+
+	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+		list_del(&frag->f_cache_entry);
+		WARN_ON(!list_empty(&frag->f_item));
+		kmem_cache_free(rds_ib_frag_slab, frag);
+	}
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+			     struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+	struct rds_ib_incoming *ibinc;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *pos;
+	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+	/* Free attached frags */
+	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+		list_del_init(&frag->f_item);
+		rds_ib_frag_free(ic, frag);
+	}
+	BUG_ON(!list_empty(&ibinc->ii_frags));
+
+	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
 }
 
 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
 		recv->r_ibinc = NULL;
 	}
 	if (recv->r_frag) {
-		rds_ib_recv_unmap_page(ic, recv);
-		if (recv->r_frag->f_page)
-			rds_ib_frag_drop_page(recv->r_frag);
-		rds_ib_frag_free(recv->r_frag);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+		rds_ib_frag_free(ic, recv->r_frag);
 		recv->r_frag = NULL;
 	}
 }
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 
 	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
 		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-
-	if (ic->i_frag.f_page)
-		rds_ib_frag_drop_page(&ic->i_frag);
 }
 
-static int rds_ib_recv_refill_one(struct rds_connection *conn,
-				  struct rds_ib_recv_work *recv,
-				  gfp_t kptr_gfp, gfp_t page_gfp)
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+						     gfp_t slab_mask)
 {
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	dma_addr_t dma_addr;
-	struct ib_sge *sge;
-	int ret = -ENOMEM;
+	struct rds_ib_incoming *ibinc;
+	struct list_head *cache_item;
+	int avail_allocs;
 
-	if (recv->r_ibinc == NULL) {
-		if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+	if (cache_item) {
+		ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+	} else {
+		avail_allocs = atomic_add_unless(&rds_ib_allocation,
+						 1, rds_ib_sysctl_max_recv_allocation);
+		if (!avail_allocs) {
 			rds_ib_stats_inc(s_ib_rx_alloc_limit);
-			goto out;
+			return NULL;
 		}
-		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
-						 kptr_gfp);
-		if (recv->r_ibinc == NULL) {
+		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+		if (!ibinc) {
 			atomic_dec(&rds_ib_allocation);
-			goto out;
+			return NULL;
 		}
-		INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
-		rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
 	}
+	INIT_LIST_HEAD(&ibinc->ii_frags);
+	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
 
-	if (recv->r_frag == NULL) {
-		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
-		if (recv->r_frag == NULL)
-			goto out;
-		INIT_LIST_HEAD(&recv->r_frag->f_item);
-		recv->r_frag->f_page = NULL;
+	return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+						    gfp_t slab_mask, gfp_t page_mask)
+{
+	struct rds_page_frag *frag;
+	struct list_head *cache_item;
+	int ret;
+
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+	if (cache_item) {
+		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+	} else {
+		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+		if (!frag)
+			return NULL;
+
+		sg_init_table(&frag->f_sg, 1);
+		ret = rds_page_remainder_alloc(&frag->f_sg,
+					       RDS_FRAG_SIZE, page_mask);
+		if (ret) {
+			kmem_cache_free(rds_ib_frag_slab, frag);
+			return NULL;
+		}
 	}
 
-	if (ic->i_frag.f_page == NULL) {
-		ic->i_frag.f_page = alloc_page(page_gfp);
-		if (ic->i_frag.f_page == NULL)
-			goto out;
-		ic->i_frag.f_offset = 0;
+	INIT_LIST_HEAD(&frag->f_item);
+
+	return frag;
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+				  struct rds_ib_recv_work *recv, int prefill)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;
+	gfp_t slab_mask = GFP_NOWAIT;
+	gfp_t page_mask = GFP_NOWAIT;
+
+	if (prefill) {
+		slab_mask = GFP_KERNEL;
+		page_mask = GFP_HIGHUSER;
 	}
 
-	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
-				   ic->i_frag.f_page,
-				   ic->i_frag.f_offset,
-				   RDS_FRAG_SIZE,
-				   DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
-		goto out;
+	if (!ic->i_cache_incs.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	if (!ic->i_cache_frags.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
 
 	/*
-	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
-	 * must be called on this recv.  This happens as completions hit
-	 * in order or on connection shutdown.
+	 * ibinc was taken from recv if recv contained the start of a message.
+	 * recvs that were continuations will still have this allocated.
 	 */
-	recv->r_frag->f_page = ic->i_frag.f_page;
-	recv->r_frag->f_offset = ic->i_frag.f_offset;
-	recv->r_frag->f_mapped = dma_addr;
+	if (!recv->r_ibinc) {
+		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+		if (!recv->r_ibinc)
+			goto out;
+	}
 
-	sge = rds_ib_data_sge(ic, recv->r_sge);
-	sge->addr = dma_addr;
-	sge->length = RDS_FRAG_SIZE;
+	WARN_ON(recv->r_frag); /* leak! */
+	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+	if (!recv->r_frag)
+		goto out;
+
+	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+			    1, DMA_FROM_DEVICE);
+	WARN_ON(ret != 1);
 
-	sge = rds_ib_header_sge(ic, recv->r_sge);
+	sge = &recv->r_sge[0];
 	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
 	sge->length = sizeof(struct rds_header);
 
-	get_page(recv->r_frag->f_page);
-
-	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
-		ic->i_frag.f_offset += RDS_FRAG_SIZE;
-	} else {
-		put_page(ic->i_frag.f_page);
-		ic->i_frag.f_page = NULL;
-		ic->i_frag.f_offset = 0;
-	}
+	sge = &recv->r_sge[1];
+	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+	sge->length = sg_dma_len(&recv->r_frag->f_sg);
 
 	ret = 0;
 out:
@@ -216,13 +350,11 @@ out:
 /*
  * This tries to allocate and post unused work requests after making sure that
  * they have all the allocations they need to queue received fragments into
- * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
+ * sockets.
  *
  * -1 is returned if posting fails due to temporary resource exhaustion.
  */
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_recv_work *recv;
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 		if (pos >= ic->i_recv_ring.w_nr) {
 			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
 					pos);
-			ret = -EINVAL;
 			break;
 		}
 
 		recv = &ic->i_recvs[pos];
-		ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+		ret = rds_ib_recv_refill_one(conn, recv, prefill);
 		if (ret) {
-			ret = -1;
 			break;
 		}
 
 		/* XXX when can this fail? */
 		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
 		rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
-			 recv->r_ibinc, recv->r_frag->f_page,
-			 (long) recv->r_frag->f_mapped, ret);
+			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+			 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
 		if (ret) {
 			rds_ib_conn_error(conn, "recv post on "
 			       "%pI4 returned %d, disconnecting and "
 			       "reconnecting\n", &conn->c_faddr,
 			       ret);
-			ret = -1;
 			break;
 		}
 
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 
 	if (ret)
 		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
-	return ret;
 }
 
-void rds_ib_inc_purge(struct rds_incoming *inc)
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * We move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/compxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true with one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
-	struct rds_page_frag *frag;
-	struct rds_page_frag *pos;
+	unsigned long flags;
+	struct rds_ib_cache_head *chp;
+	struct list_head *old;
 
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-	rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+	local_irq_save(flags);
 
-	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
-		list_del_init(&frag->f_item);
-		rds_ib_frag_drop_page(frag);
-		rds_ib_frag_free(frag);
-	}
+	chp = per_cpu_ptr(cache->percpu, smp_processor_id());
+	if (!chp->first)
+		INIT_LIST_HEAD(new_item);
+	else /* put on front */
+		list_add_tail(new_item, chp->first);
+	chp->first = new_item;
+	chp->count++;
+
+	if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
+		goto end;
+
+	/*
+	 * Return our per-cpu first list to the cache's xfer by atomically
+	 * grabbing the current xfer list, appending it to our per-cpu list,
+	 * and then atomically returning that entire list back to the
+	 * cache's xfer list as long as it's still empty.
+	 */
+	do {
+		old = xchg(&cache->xfer, NULL);
+		if (old)
+			list_splice_entire_tail(old, chp->first);
+		old = cmpxchg(&cache->xfer, NULL, chp->first);
+	} while (old);
+
+	chp->first = NULL;
+	chp->count = 0;
+end:
+	local_irq_restore(flags);
 }
 
-void rds_ib_inc_free(struct rds_incoming *inc)
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
+	struct list_head *head = cache->ready;
 
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+	if (head) {
+		if (!list_empty(head)) {
+			cache->ready = head->next;
+			list_del_init(head);
+		} else
+			cache->ready = NULL;
+	}
 
-	rds_ib_inc_purge(inc);
-	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
-	BUG_ON(!list_empty(&ibinc->ii_frags));
-	kmem_cache_free(rds_ib_incoming_slab, ibinc);
-	atomic_dec(&rds_ib_allocation);
-	BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+	return head;
 }
 
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
 		to_copy = min_t(unsigned long, to_copy, len - copied);
 
 		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
-			 "[%p, %lu] + %lu\n",
+			 "[%p, %u] + %lu\n",
 			 to_copy, iov->iov_base, iov->iov_len, iov_off,
-			 frag->f_page, frag->f_offset, frag_off);
+			 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
 
 		/* XXX needs + offset for multiple recvs per page */
-		ret = rds_page_copy_to_user(frag->f_page,
-					    frag->f_offset + frag_off,
+		ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
+					    frag->f_sg.offset + frag_off,
 					    iov->iov_base + iov_off,
 					    to_copy);
 		if (ret) {
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
 	return rds_ib_get_ack(ic);
 }
 
-static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
-					    struct rds_ib_recv_work *recv,
-					    u32 data_len)
-{
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
-	void *addr;
-	u32 misplaced_hdr_bytes;
-
-	/*
-	 * Support header at the front (RDS 3.1+) as well as header-at-end.
-	 *
-	 * Cases:
-	 * 1) header all in header buff (great!)
-	 * 2) header all in data page (copy all to header buff)
-	 * 3) header split across hdr buf + data page
-	 *    (move bit in hdr buff to end before copying other bit from data page)
-	 */
-	if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
-		return hdr_buff;
-
-	if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
-		addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-		memcpy(hdr_buff,
-		       addr + recv->r_frag->f_offset + data_len,
-		       sizeof(struct rds_header));
-		kunmap_atomic(addr, KM_SOFTIRQ0);
-		return hdr_buff;
-	}
-
-	misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
-
-	memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
-
-	addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-	memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
-	       sizeof(struct rds_header) - misplaced_hdr_bytes);
-	kunmap_atomic(addr, KM_SOFTIRQ0);
-	return hdr_buff;
-}
-
 /*
  * It's kind of lame that we're copying from the posted receive pages into
  * long-lived bitmaps. We could have posted the bitmaps and rdma written into
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
 		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
 		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-		addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+		addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
 
 		src = addr + frag_off;
 		dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	}
 	data_len -= sizeof(struct rds_header);
 
-	ihdr = rds_ib_get_header(conn, recv, data_len);
+	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
 
 	/* Validate the checksum. */
 	if (!rds_message_verify_checksum(ihdr)) {
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		 * the inc is freed.  We don't go that route, so we have to drop the
 		 * page ref ourselves.  We can't just leave the page on the recv
 		 * because that confuses the dma mapping of pages and each recv's use
-		 * of a partial page.  We can leave the frag, though, it will be
-		 * reused.
+		 * of a partial page.
 		 *
 		 * FIXME: Fold this into the code path below.
 		 */
-		rds_ib_frag_drop_page(recv->r_frag);
+		rds_ib_frag_free(ic, recv->r_frag);
+		recv->r_frag = NULL;
 		return;
 	}
 
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	 * into the inc and save the inc so we can hang upcoming fragments
 	 * off its list.
 	 */
-	if (ibinc == NULL) {
+	if (!ibinc) {
 		ibinc = recv->r_ibinc;
 		recv->r_ibinc = NULL;
 		ic->i_ibinc = ibinc;
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
 	struct rds_ib_recv_work *recv;
 
 	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
-		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
 			 be32_to_cpu(wc.ex.imm_data));
 		rds_ib_stats_inc(s_ib_rx_cq_event);
 
 		recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 
-		rds_ib_recv_unmap_page(ic, recv);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 
 		/*
 		 * Also process recvs in connecting state because it is possible
 		 * to get a recv completion _before_ the rdmacm ESTABLISHED
 		 * event is processed.
 		 */
-		if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+		if (wc.status == IB_WC_SUCCESS) {
+			rds_ib_process_recv(conn, recv, wc.byte_len, state);
+		} else {
 			/* We expect errors as the qp is drained during shutdown */
-			if (wc.status == IB_WC_SUCCESS) {
-				rds_ib_process_recv(conn, recv, wc.byte_len, state);
-			} else {
-				rds_ib_conn_error(conn, "recv completion on "
-				       "%pI4 had status %u, disconnecting and "
-				       "reconnecting\n", &conn->c_faddr,
-				       wc.status);
-			}
+			if (rds_conn_up(conn) || rds_conn_connecting(conn))
+				rds_ib_conn_error(conn, "recv completion on %pI4 had "
+						  "status %u (%s), disconnecting and "
+						  "reconnecting\n", &conn->c_faddr,
+						  wc.status,
+						  rds_ib_wc_status_str(wc.status));
 		}
 
+		/*
+		 * It's very important that we only free this ring entry if we've truly
+		 * freed the resources allocated to the entry.  The refilling path can
+		 * leak if we don't.
+		 */
 		rds_ib_ring_free(&ic->i_recv_ring, 1);
 	}
 }
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
 	if (rds_ib_ring_empty(&ic->i_recv_ring))
 		rds_ib_stats_inc(s_ib_rx_ring_empty);
 
-	/*
-	 * If the ring is running low, then schedule the thread to refill.
-	 */
 	if (rds_ib_ring_low(&ic->i_recv_ring))
-		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+		rds_ib_recv_refill(conn, 0);
 }
 
 int rds_ib_recv(struct rds_connection *conn)
@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn)
 	int ret = 0;
 
 	rdsdebug("conn %p\n", conn);
-
-	/*
-	 * If we get a temporary posting failure in this context then
-	 * we're really low and we want the caller to back off for a bit.
-	 */
-	mutex_lock(&ic->i_recv_mutex);
-	if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
-		ret = -ENOMEM;
-	else
-		rds_ib_stats_inc(s_ib_rx_refill_from_thread);
-	mutex_unlock(&ic->i_recv_mutex);
-
 	if (rds_conn_up(conn))
 		rds_ib_attempt_ack(ic);
 
 	return ret;
 }
 
-int __init rds_ib_recv_init(void)
+int rds_ib_recv_init(void)
 {
 	struct sysinfo si;
 	int ret = -ENOMEM;
@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void)
 
 	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
 					sizeof(struct rds_ib_incoming),
-					0, 0, NULL);
-	if (rds_ib_incoming_slab == NULL)
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_incoming_slab)
 		goto out;
 
 	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
 					sizeof(struct rds_page_frag),
-					0, 0, NULL);
-	if (rds_ib_frag_slab == NULL)
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_frag_slab)
 		kmem_cache_destroy(rds_ib_incoming_slab);
 	else
 		ret = 0;
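
The per-cpu recycling scheme added by this diff (see the comment block above rds_ib_recv_cache_put) batches freed incs and frags on a per-cpu list and then publishes whole batches through a single pointer that is swapped with xchg()/cmpxchg(). The sketch below is a minimal userspace illustration of that publish/consume pattern, not RDS code: it replaces list_head and the per-cpu heads with a NULL-terminated chain and one producer-local batch, and uses C11 atomics instead of the kernel's xchg()/cmpxchg(). All names (item, cache_put, chain_splice, BATCH_COUNT, ...) are invented for the example.

#include <stdatomic.h>
#include <stdio.h>

/* Recyclable object: a NULL-terminated chain instead of a kernel list_head. */
struct item {
	struct item *next;
	int id;
};

struct cache {
	_Atomic(struct item *) xfer;	/* producers publish full batches here */
	struct item *ready;		/* consumer-private ready chain */
};

struct local_batch {			/* stand-in for one per-cpu head */
	struct item *first;
	int count;
};

#define BATCH_COUNT 4			/* stand-in for RDS_IB_RECYCLE_BATCH_COUNT */

/* Append chain 'add' to the tail of chain 'head'; returns the combined chain. */
static struct item *chain_splice(struct item *head, struct item *add)
{
	struct item *p;

	if (!head)
		return add;
	for (p = head; p->next; p = p->next)
		;
	p->next = add;
	return head;
}

/* Producer: stash locally; once a batch is full, publish it through the shared
 * xfer pointer with the same grab-splice-reinstall loop the patch uses. */
static void cache_put(struct cache *c, struct local_batch *l, struct item *it)
{
	struct item *old, *expected;

	it->next = l->first;
	l->first = it;
	if (++l->count < BATCH_COUNT)
		return;

	do {
		old = atomic_exchange(&c->xfer, NULL);	/* grab whatever is queued */
		if (old)
			l->first = chain_splice(l->first, old);
		expected = NULL;
		/* install the combined chain only while xfer is still empty */
	} while (!atomic_compare_exchange_strong(&c->xfer, &expected, l->first));

	l->first = NULL;
	l->count = 0;
}

/* Consumer: move published batches to a private ready chain, then pop one. */
static struct item *cache_get(struct cache *c)
{
	struct item *it;

	if (!c->ready)
		c->ready = atomic_exchange(&c->xfer, NULL);
	if (!c->ready)
		return NULL;
	it = c->ready;
	c->ready = it->next;
	it->next = NULL;
	return it;
}

int main(void)
{
	struct cache c = { .ready = NULL };
	struct local_batch l = { NULL, 0 };
	struct item items[8];
	struct item *it;

	atomic_init(&c.xfer, NULL);
	for (int i = 0; i < 8; i++) {
		items[i].id = i;
		cache_put(&c, &l, &items[i]);	/* two batches of four get published */
	}
	while ((it = cache_get(&c)) != NULL)
		printf("recycled item %d\n", it->id);
	return 0;
}

The example runs single-threaded just to show the data movement; in the patch the producer side additionally runs under local_irq_save() and each CPU keeps its own batch head, which is what makes the single xchg'd xfer pointer sufficient for lockless handoff.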