path: root/net/rds/ib_recv.c
author	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-23 14:47:02 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-23 14:47:02 -0400
commit	5f05647dd81c11a6a165ccc8f0c1370b16f3bcb0 (patch)
tree	7851ef1c93aa1aba7ef327ca4b75fd35e6d10f29	/net/rds/ib_recv.c
parent	02f36038c568111ad4fc433f6fa760ff5e38fab4 (diff)
parent	ec37a48d1d16c30b655ac5280209edf52a6775d4 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1699 commits)
  bnx2/bnx2x: Unsupported Ethtool operations should return -EINVAL.
  vlan: Calling vlan_hwaccel_do_receive() is always valid.
  tproxy: use the interface primary IP address as a default value for --on-ip
  tproxy: added IPv6 support to the socket match
  cxgb3: function namespace cleanup
  tproxy: added IPv6 support to the TPROXY target
  tproxy: added IPv6 socket lookup function to nf_tproxy_core
  be2net: Changes to use only priority codes allowed by f/w
  tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled
  tproxy: added tproxy sockopt interface in the IPV6 layer
  tproxy: added udp6_lib_lookup function
  tproxy: added const specifiers to udp lookup functions
  tproxy: split off ipv6 defragmentation to a separate module
  l2tp: small cleanup
  nf_nat: restrict ICMP translation for embedded header
  can: mcp251x: fix generation of error frames
  can: mcp251x: fix endless loop in interrupt handler if CANINTF_MERRF is set
  can-raw: add msg_flags to distinguish local traffic
  9p: client code cleanup
  rds: make local functions/variables static
  ...

Fix up conflicts in net/core/dev.c, drivers/net/pcmcia/smc91c92_cs.c and
drivers/net/wireless/ath/ath9k/debug.c as per David
Diffstat (limited to 'net/rds/ib_recv.c')
-rw-r--r--	net/rds/ib_recv.c	549
1 files changed, 332 insertions, 217 deletions
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index c74e9904a6b2..e29e0ca32f74 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
 
-static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	__free_page(frag->f_page);
-	frag->f_page = NULL;
-}
-
-static void rds_ib_frag_free(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page != NULL);
-	kmem_cache_free(rds_ib_frag_slab, frag);
-}
-
-/*
- * We map a page at a time.  Its fragments are posted in order.  This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
-				   struct rds_ib_recv_work *recv)
-{
-	struct rds_page_frag *frag = recv->r_frag;
-
-	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
-	if (frag->f_mapped)
-		ib_dma_unmap_page(ic->i_cm_id->device,
-				  frag->f_mapped,
-				  RDS_FRAG_SIZE, DMA_FROM_DEVICE);
-	frag->f_mapped = 0;
-}
-
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
 	struct rds_ib_recv_work *recv;
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 		recv->r_wr.sg_list = recv->r_sge;
 		recv->r_wr.num_sge = RDS_IB_RECV_SGE;
 
-		sge = rds_ib_data_sge(ic, recv->r_sge);
+		sge = &recv->r_sge[0];
+		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->length = sizeof(struct rds_header);
+		sge->lkey = ic->i_mr->lkey;
+
+		sge = &recv->r_sge[1];
 		sge->addr = 0;
 		sge->length = RDS_FRAG_SIZE;
 		sge->lkey = ic->i_mr->lkey;
+	}
+}
 
-		sge = rds_ib_header_sge(ic, recv->r_sge);
-		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
-		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+				    struct list_head *to)
+{
+	struct list_head *from_last = from->prev;
+
+	list_splice_tail(from_last, to);
+	list_add_tail(from_last, to);
+}
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *tmp;
+
+	tmp = xchg(&cache->xfer, NULL);
+	if (tmp) {
+		if (cache->ready)
+			list_splice_entire_tail(tmp, cache->ready);
+		else
+			cache->ready = tmp;
+	}
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+	if (!cache->percpu)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		head->first = NULL;
+		head->count = 0;
+	}
+	cache->xfer = NULL;
+	cache->ready = NULL;
+
+	return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+	int ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+	if (!ret) {
+		ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+		if (ret)
+			free_percpu(ic->i_cache_incs.percpu);
 	}
+
+	return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+					  struct list_head *caller_list)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		if (head->first) {
+			list_splice_entire_tail(head->first, caller_list);
+			head->first = NULL;
+		}
+	}
+
+	if (cache->ready) {
+		list_splice_entire_tail(cache->ready, caller_list);
+		cache->ready = NULL;
+	}
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+	struct rds_ib_incoming *inc;
+	struct rds_ib_incoming *inc_tmp;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *frag_tmp;
+	LIST_HEAD(list);
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+	free_percpu(ic->i_cache_incs.percpu);
+
+	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+		list_del(&inc->ii_cache_entry);
+		WARN_ON(!list_empty(&inc->ii_frags));
+		kmem_cache_free(rds_ib_incoming_slab, inc);
+	}
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+	free_percpu(ic->i_cache_frags.percpu);
+
+	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+		list_del(&frag->f_cache_entry);
+		WARN_ON(!list_empty(&frag->f_item));
+		kmem_cache_free(rds_ib_frag_slab, frag);
+	}
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+			     struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+	struct rds_ib_incoming *ibinc;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *pos;
+	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+	/* Free attached frags */
+	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+		list_del_init(&frag->f_item);
+		rds_ib_frag_free(ic, frag);
+	}
+	BUG_ON(!list_empty(&ibinc->ii_frags));
+
+	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
 }
 
 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
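Note on list_splice_entire_tail() in the hunk above: the new refill caches anchor a chain of recycled entries with a bare pointer to the first element rather than a dedicated list_head, so moving "everything, including the element the pointer names" takes the splice-plus-add pair shown. The following is an illustrative userspace re-implementation of only the list primitives involved, assuming the usual circular doubly-linked semantics of <linux/list.h>; it is a sketch, not the kernel code itself.

#include <stddef.h>

struct list_head {
	struct list_head *next, *prev;
};

/* Minimal stand-ins for the <linux/list.h> primitives the helper relies on. */
static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_splice_tail(struct list_head *list, struct list_head *head)
{
	struct list_head *first = list->next;
	struct list_head *last = list->prev;

	if (first == list)		/* nothing besides the anchor itself */
		return;
	first->prev = head->prev;
	head->prev->next = first;
	last->next = head;
	head->prev = last;
}

/*
 * 'from' points at the first element of a headless ring, so from->prev is
 * the last element.  Splice every element except that last one onto the
 * tail of 'to', then append the last element itself: the whole ring,
 * including the element 'from' names, ends up at the tail of 'to' in order.
 */
static void list_splice_entire_tail(struct list_head *from, struct list_head *to)
{
	struct list_head *from_last = from->prev;

	list_splice_tail(from_last, to);
	list_add_tail(from_last, to);
}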
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
 		recv->r_ibinc = NULL;
 	}
 	if (recv->r_frag) {
-		rds_ib_recv_unmap_page(ic, recv);
-		if (recv->r_frag->f_page)
-			rds_ib_frag_drop_page(recv->r_frag);
-		rds_ib_frag_free(recv->r_frag);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+		rds_ib_frag_free(ic, recv->r_frag);
 		recv->r_frag = NULL;
 	}
 }
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 
 	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
 		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-
-	if (ic->i_frag.f_page)
-		rds_ib_frag_drop_page(&ic->i_frag);
 }
 
-static int rds_ib_recv_refill_one(struct rds_connection *conn,
-				  struct rds_ib_recv_work *recv,
-				  gfp_t kptr_gfp, gfp_t page_gfp)
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+						     gfp_t slab_mask)
 {
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	dma_addr_t dma_addr;
-	struct ib_sge *sge;
-	int ret = -ENOMEM;
+	struct rds_ib_incoming *ibinc;
+	struct list_head *cache_item;
+	int avail_allocs;
 
-	if (recv->r_ibinc == NULL) {
-		if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+	if (cache_item) {
+		ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+	} else {
+		avail_allocs = atomic_add_unless(&rds_ib_allocation,
+						 1, rds_ib_sysctl_max_recv_allocation);
+		if (!avail_allocs) {
 			rds_ib_stats_inc(s_ib_rx_alloc_limit);
-			goto out;
+			return NULL;
 		}
-		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
-						 kptr_gfp);
-		if (recv->r_ibinc == NULL) {
+		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+		if (!ibinc) {
 			atomic_dec(&rds_ib_allocation);
-			goto out;
+			return NULL;
 		}
-		INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
-		rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
 	}
+	INIT_LIST_HEAD(&ibinc->ii_frags);
+	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
 
-	if (recv->r_frag == NULL) {
-		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
-		if (recv->r_frag == NULL)
-			goto out;
-		INIT_LIST_HEAD(&recv->r_frag->f_item);
-		recv->r_frag->f_page = NULL;
+	return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+						    gfp_t slab_mask, gfp_t page_mask)
+{
+	struct rds_page_frag *frag;
+	struct list_head *cache_item;
+	int ret;
+
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+	if (cache_item) {
+		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+	} else {
+		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+		if (!frag)
+			return NULL;
+
+		sg_init_table(&frag->f_sg, 1);
+		ret = rds_page_remainder_alloc(&frag->f_sg,
+					       RDS_FRAG_SIZE, page_mask);
+		if (ret) {
+			kmem_cache_free(rds_ib_frag_slab, frag);
+			return NULL;
+		}
 	}
 
-	if (ic->i_frag.f_page == NULL) {
-		ic->i_frag.f_page = alloc_page(page_gfp);
-		if (ic->i_frag.f_page == NULL)
-			goto out;
-		ic->i_frag.f_offset = 0;
+	INIT_LIST_HEAD(&frag->f_item);
+
+	return frag;
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+				  struct rds_ib_recv_work *recv, int prefill)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;
+	gfp_t slab_mask = GFP_NOWAIT;
+	gfp_t page_mask = GFP_NOWAIT;
+
+	if (prefill) {
+		slab_mask = GFP_KERNEL;
+		page_mask = GFP_HIGHUSER;
 	}
 
-	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
-				   ic->i_frag.f_page,
-				   ic->i_frag.f_offset,
-				   RDS_FRAG_SIZE,
-				   DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
-		goto out;
+	if (!ic->i_cache_incs.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	if (!ic->i_cache_frags.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
 
 	/*
-	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
-	 * must be called on this recv.  This happens as completions hit
-	 * in order or on connection shutdown.
+	 * ibinc was taken from recv if recv contained the start of a message.
+	 * recvs that were continuations will still have this allocated.
 	 */
-	recv->r_frag->f_page = ic->i_frag.f_page;
-	recv->r_frag->f_offset = ic->i_frag.f_offset;
-	recv->r_frag->f_mapped = dma_addr;
+	if (!recv->r_ibinc) {
+		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+		if (!recv->r_ibinc)
+			goto out;
+	}
 
-	sge = rds_ib_data_sge(ic, recv->r_sge);
-	sge->addr = dma_addr;
-	sge->length = RDS_FRAG_SIZE;
+	WARN_ON(recv->r_frag); /* leak! */
+	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+	if (!recv->r_frag)
+		goto out;
+
+	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+			    1, DMA_FROM_DEVICE);
+	WARN_ON(ret != 1);
 
-	sge = rds_ib_header_sge(ic, recv->r_sge);
+	sge = &recv->r_sge[0];
 	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
 	sge->length = sizeof(struct rds_header);
 
-	get_page(recv->r_frag->f_page);
-
-	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
-		ic->i_frag.f_offset += RDS_FRAG_SIZE;
-	} else {
-		put_page(ic->i_frag.f_page);
-		ic->i_frag.f_page = NULL;
-		ic->i_frag.f_offset = 0;
-	}
+	sge = &recv->r_sge[1];
+	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+	sge->length = sg_dma_len(&recv->r_frag->f_sg);
 
 	ret = 0;
 out:
@@ -216,13 +350,11 @@ out:
 /*
  * This tries to allocate and post unused work requests after making sure that
  * they have all the allocations they need to queue received fragments into
- * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
+ * sockets.
  *
  * -1 is returned if posting fails due to temporary resource exhaustion.
  */
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_recv_work *recv;
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 		if (pos >= ic->i_recv_ring.w_nr) {
 			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
 				pos);
-			ret = -EINVAL;
 			break;
 		}
 
 		recv = &ic->i_recvs[pos];
-		ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+		ret = rds_ib_recv_refill_one(conn, recv, prefill);
 		if (ret) {
-			ret = -1;
 			break;
 		}
 
 		/* XXX when can this fail? */
 		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
 		rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
-			 recv->r_ibinc, recv->r_frag->f_page,
-			 (long) recv->r_frag->f_mapped, ret);
+			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+			 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
 		if (ret) {
 			rds_ib_conn_error(conn, "recv post on "
 			       "%pI4 returned %d, disconnecting and "
 			       "reconnecting\n", &conn->c_faddr,
 			       ret);
-			ret = -1;
 			break;
 		}
 
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 
 	if (ret)
 		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
-	return ret;
 }
 
-void rds_ib_inc_purge(struct rds_incoming *inc)
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * We move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/compxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true with one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
-	struct rds_page_frag *frag;
-	struct rds_page_frag *pos;
+	unsigned long flags;
+	struct rds_ib_cache_head *chp;
+	struct list_head *old;
 
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-	rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+	local_irq_save(flags);
 
-	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
-		list_del_init(&frag->f_item);
-		rds_ib_frag_drop_page(frag);
-		rds_ib_frag_free(frag);
-	}
+	chp = per_cpu_ptr(cache->percpu, smp_processor_id());
+	if (!chp->first)
+		INIT_LIST_HEAD(new_item);
+	else /* put on front */
+		list_add_tail(new_item, chp->first);
+	chp->first = new_item;
+	chp->count++;
+
+	if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
+		goto end;
+
+	/*
+	 * Return our per-cpu first list to the cache's xfer by atomically
+	 * grabbing the current xfer list, appending it to our per-cpu list,
+	 * and then atomically returning that entire list back to the
+	 * cache's xfer list as long as it's still empty.
+	 */
+	do {
+		old = xchg(&cache->xfer, NULL);
+		if (old)
+			list_splice_entire_tail(old, chp->first);
+		old = cmpxchg(&cache->xfer, NULL, chp->first);
+	} while (old);
+
+	chp->first = NULL;
+	chp->count = 0;
+end:
+	local_irq_restore(flags);
 }
 
-void rds_ib_inc_free(struct rds_incoming *inc)
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
+	struct list_head *head = cache->ready;
 
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+	if (head) {
+		if (!list_empty(head)) {
+			cache->ready = head->next;
+			list_del_init(head);
+		} else
+			cache->ready = NULL;
+	}
 
-	rds_ib_inc_purge(inc);
-	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
-	BUG_ON(!list_empty(&ibinc->ii_frags));
-	kmem_cache_free(rds_ib_incoming_slab, ibinc);
-	atomic_dec(&rds_ib_allocation);
-	BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+	return head;
 }
 
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
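As a reading aid for the cache_put/cache_get pair above: freed incs and frags are first batched on a per-cpu chain and, once RDS_IB_RECYCLE_BATCH_COUNT entries accumulate, handed to the shared cache->xfer pointer without taking a lock. Below is a rough userspace analogue of that handoff written with C11 atomics in place of the kernel's xchg()/cmpxchg(); the node type and the publish_batch/take_batch names are made up for illustration and are not part of the patch.

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;			/* newest-first singly linked batch */
};

static _Atomic(struct node *) xfer;		/* shared handoff slot, NULL == empty */

/*
 * Producer: publish a whole local batch.  If another batch is already
 * queued, pull it off, chain it behind ours and retry -- the same shape
 * as the xchg()/cmpxchg() loop in rds_ib_recv_cache_put() above.
 */
static void publish_batch(struct node *batch)
{
	struct node *queued;
	struct node *expected;

	do {
		queued = atomic_exchange(&xfer, NULL);
		if (queued) {
			struct node *tail = batch;

			while (tail->next)
				tail = tail->next;
			tail->next = queued;	/* splice the old batch behind ours */
		}
		expected = NULL;
	} while (!atomic_compare_exchange_strong(&xfer, &expected, batch));
}

/* Consumer: take everything published so far, as rds_ib_cache_xfer_to_ready() does. */
static struct node *take_batch(void)
{
	return atomic_exchange(&xfer, NULL);
}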
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
 		to_copy = min_t(unsigned long, to_copy, len - copied);
 
 		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
-			 "[%p, %lu] + %lu\n",
+			 "[%p, %u] + %lu\n",
 			 to_copy, iov->iov_base, iov->iov_len, iov_off,
-			 frag->f_page, frag->f_offset, frag_off);
+			 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
 
 		/* XXX needs + offset for multiple recvs per page */
-		ret = rds_page_copy_to_user(frag->f_page,
-					    frag->f_offset + frag_off,
+		ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
+					    frag->f_sg.offset + frag_off,
 					    iov->iov_base + iov_off,
 					    to_copy);
 		if (ret) {
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
 	return rds_ib_get_ack(ic);
 }
 
-static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
-					    struct rds_ib_recv_work *recv,
-					    u32 data_len)
-{
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
-	void *addr;
-	u32 misplaced_hdr_bytes;
-
-	/*
-	 * Support header at the front (RDS 3.1+) as well as header-at-end.
-	 *
-	 * Cases:
-	 * 1) header all in header buff (great!)
-	 * 2) header all in data page (copy all to header buff)
-	 * 3) header split across hdr buf + data page
-	 *    (move bit in hdr buff to end before copying other bit from data page)
-	 */
-	if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
-		return hdr_buff;
-
-	if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
-		addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-		memcpy(hdr_buff,
-		       addr + recv->r_frag->f_offset + data_len,
-		       sizeof(struct rds_header));
-		kunmap_atomic(addr, KM_SOFTIRQ0);
-		return hdr_buff;
-	}
-
-	misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
-
-	memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
-
-	addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-	memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
-	       sizeof(struct rds_header) - misplaced_hdr_bytes);
-	kunmap_atomic(addr, KM_SOFTIRQ0);
-	return hdr_buff;
-}
-
 /*
  * It's kind of lame that we're copying from the posted receive pages into
  * long-lived bitmaps. We could have posted the bitmaps and rdma written into
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
 		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
 		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-		addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+		addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
 
 		src = addr + frag_off;
 		dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	}
 	data_len -= sizeof(struct rds_header);
 
-	ihdr = rds_ib_get_header(conn, recv, data_len);
+	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
 
 	/* Validate the checksum. */
 	if (!rds_message_verify_checksum(ihdr)) {
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		 * the inc is freed.  We don't go that route, so we have to drop the
 		 * page ref ourselves.  We can't just leave the page on the recv
 		 * because that confuses the dma mapping of pages and each recv's use
-		 * of a partial page.  We can leave the frag, though, it will be
-		 * reused.
+		 * of a partial page.
 		 *
 		 * FIXME: Fold this into the code path below.
 		 */
-		rds_ib_frag_drop_page(recv->r_frag);
+		rds_ib_frag_free(ic, recv->r_frag);
+		recv->r_frag = NULL;
 		return;
 	}
 
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	 * into the inc and save the inc so we can hang upcoming fragments
 	 * off its list.
 	 */
-	if (ibinc == NULL) {
+	if (!ibinc) {
 		ibinc = recv->r_ibinc;
 		recv->r_ibinc = NULL;
 		ic->i_ibinc = ibinc;
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
 	struct rds_ib_recv_work *recv;
 
 	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
-		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
 			 be32_to_cpu(wc.ex.imm_data));
 		rds_ib_stats_inc(s_ib_rx_cq_event);
 
 		recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 
-		rds_ib_recv_unmap_page(ic, recv);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 
 		/*
 		 * Also process recvs in connecting state because it is possible
 		 * to get a recv completion _before_ the rdmacm ESTABLISHED
 		 * event is processed.
 		 */
-		if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+		if (wc.status == IB_WC_SUCCESS) {
+			rds_ib_process_recv(conn, recv, wc.byte_len, state);
+		} else {
 			/* We expect errors as the qp is drained during shutdown */
-			if (wc.status == IB_WC_SUCCESS) {
-				rds_ib_process_recv(conn, recv, wc.byte_len, state);
-			} else {
-				rds_ib_conn_error(conn, "recv completion on "
-				       "%pI4 had status %u, disconnecting and "
-				       "reconnecting\n", &conn->c_faddr,
-				       wc.status);
-			}
+			if (rds_conn_up(conn) || rds_conn_connecting(conn))
+				rds_ib_conn_error(conn, "recv completion on %pI4 had "
+						  "status %u (%s), disconnecting and "
+						  "reconnecting\n", &conn->c_faddr,
+						  wc.status,
+						  rds_ib_wc_status_str(wc.status));
 		}
 
+		/*
+		 * It's very important that we only free this ring entry if we've truly
+		 * freed the resources allocated to the entry.  The refilling path can
+		 * leak if we don't.
+		 */
 		rds_ib_ring_free(&ic->i_recv_ring, 1);
 	}
 }
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
 	if (rds_ib_ring_empty(&ic->i_recv_ring))
 		rds_ib_stats_inc(s_ib_rx_ring_empty);
 
-	/*
-	 * If the ring is running low, then schedule the thread to refill.
-	 */
 	if (rds_ib_ring_low(&ic->i_recv_ring))
-		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+		rds_ib_recv_refill(conn, 0);
 }
 
 int rds_ib_recv(struct rds_connection *conn)
@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn)
 	int ret = 0;
 
 	rdsdebug("conn %p\n", conn);
-
-	/*
-	 * If we get a temporary posting failure in this context then
-	 * we're really low and we want the caller to back off for a bit.
-	 */
-	mutex_lock(&ic->i_recv_mutex);
-	if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
-		ret = -ENOMEM;
-	else
-		rds_ib_stats_inc(s_ib_rx_refill_from_thread);
-	mutex_unlock(&ic->i_recv_mutex);
-
 	if (rds_conn_up(conn))
 		rds_ib_attempt_ack(ic);
 
 	return ret;
 }
 
-int __init rds_ib_recv_init(void)
+int rds_ib_recv_init(void)
 {
 	struct sysinfo si;
 	int ret = -ENOMEM;
@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void)
 
 	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
 					sizeof(struct rds_ib_incoming),
-					0, 0, NULL);
-	if (rds_ib_incoming_slab == NULL)
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_incoming_slab)
 		goto out;
 
 	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
 					sizeof(struct rds_page_frag),
-					0, 0, NULL);
-	if (rds_ib_frag_slab == NULL)
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_frag_slab)
 		kmem_cache_destroy(rds_ib_incoming_slab);
 	else
 		ret = 0;