diff options
author | Jens Axboe <jens.axboe@oracle.com> | 2007-11-07 02:30:13 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-01-28 17:53:31 -0500 |
commit | 9c55e01c0cc835818475a6ce8c4d684df9949ac8 (patch) | |
tree | 1115311436677f837a4b477e3fd23c5e0ae184ef /net/core | |
parent | bbdfc2f70610bebb841d0874dc901c648308e43a (diff) |
[TCP]: Splice receive support.
Support for network splice receive.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/skbuff.c | 246 |
1 files changed, 246 insertions, 0 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b6283779e93d..98420f9c4b6d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c | |||
@@ -52,6 +52,7 @@ | |||
52 | #endif | 52 | #endif |
53 | #include <linux/string.h> | 53 | #include <linux/string.h> |
54 | #include <linux/skbuff.h> | 54 | #include <linux/skbuff.h> |
55 | #include <linux/splice.h> | ||
55 | #include <linux/cache.h> | 56 | #include <linux/cache.h> |
56 | #include <linux/rtnetlink.h> | 57 | #include <linux/rtnetlink.h> |
57 | #include <linux/init.h> | 58 | #include <linux/init.h> |
@@ -71,6 +72,40 @@ | |||
71 | static struct kmem_cache *skbuff_head_cache __read_mostly; | 72 | static struct kmem_cache *skbuff_head_cache __read_mostly; |
72 | static struct kmem_cache *skbuff_fclone_cache __read_mostly; | 73 | static struct kmem_cache *skbuff_fclone_cache __read_mostly; |
73 | 74 | ||
75 | static void sock_pipe_buf_release(struct pipe_inode_info *pipe, | ||
76 | struct pipe_buffer *buf) | ||
77 | { | ||
78 | struct sk_buff *skb = (struct sk_buff *) buf->private; | ||
79 | |||
80 | kfree_skb(skb); | ||
81 | } | ||
82 | |||
83 | static void sock_pipe_buf_get(struct pipe_inode_info *pipe, | ||
84 | struct pipe_buffer *buf) | ||
85 | { | ||
86 | struct sk_buff *skb = (struct sk_buff *) buf->private; | ||
87 | |||
88 | skb_get(skb); | ||
89 | } | ||
90 | |||
/*
 * Steal callback for socket pipe buffers. Unconditionally returns 1 and
 * never touches the buffer.
 * NOTE(review): presumably a non-zero return tells the pipe code that the
 * page cannot be stolen -- confirm against the ->steal() contract.
 */
static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	return 1;
}
96 | |||
97 | |||
98 | /* Pipe buffer operations for a socket. */ | ||
99 | static struct pipe_buf_operations sock_pipe_buf_ops = { | ||
100 | .can_merge = 0, | ||
101 | .map = generic_pipe_buf_map, | ||
102 | .unmap = generic_pipe_buf_unmap, | ||
103 | .confirm = generic_pipe_buf_confirm, | ||
104 | .release = sock_pipe_buf_release, | ||
105 | .steal = sock_pipe_buf_steal, | ||
106 | .get = sock_pipe_buf_get, | ||
107 | }; | ||
108 | |||
74 | /* | 109 | /* |
75 | * Keep out-of-line to prevent kernel bloat. | 110 | * Keep out-of-line to prevent kernel bloat. |
76 | * __builtin_return_address is not used because it is not always | 111 | * __builtin_return_address is not used because it is not always |
@@ -1122,6 +1157,217 @@ fault: | |||
1122 | return -EFAULT; | 1157 | return -EFAULT; |
1123 | } | 1158 | } |
1124 | 1159 | ||
1160 | /* | ||
1161 | * Callback from splice_to_pipe(), if we need to release some pages | ||
1162 | * at the end of the spd in case we error'ed out in filling the pipe. | ||
1163 | */ | ||
1164 | static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) | ||
1165 | { | ||
1166 | struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private; | ||
1167 | |||
1168 | kfree_skb(skb); | ||
1169 | } | ||
1170 | |||
1171 | /* | ||
1172 | * Fill page/offset/length into spd, if it can hold more pages. | ||
1173 | */ | ||
1174 | static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, | ||
1175 | unsigned int len, unsigned int offset, | ||
1176 | struct sk_buff *skb) | ||
1177 | { | ||
1178 | if (unlikely(spd->nr_pages == PIPE_BUFFERS)) | ||
1179 | return 1; | ||
1180 | |||
1181 | spd->pages[spd->nr_pages] = page; | ||
1182 | spd->partial[spd->nr_pages].len = len; | ||
1183 | spd->partial[spd->nr_pages].offset = offset; | ||
1184 | spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb); | ||
1185 | spd->nr_pages++; | ||
1186 | return 0; | ||
1187 | } | ||
1188 | |||
1189 | /* | ||
1190 | * Map linear and fragment data from the skb to spd. Returns number of | ||
1191 | * pages mapped. | ||
1192 | */ | ||
1193 | static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, | ||
1194 | unsigned int *total_len, | ||
1195 | struct splice_pipe_desc *spd) | ||
1196 | { | ||
1197 | unsigned int nr_pages = spd->nr_pages; | ||
1198 | unsigned int poff, plen, len, toff, tlen; | ||
1199 | int headlen, seg; | ||
1200 | |||
1201 | toff = *offset; | ||
1202 | tlen = *total_len; | ||
1203 | if (!tlen) | ||
1204 | goto err; | ||
1205 | |||
1206 | /* | ||
1207 | * if the offset is greater than the linear part, go directly to | ||
1208 | * the fragments. | ||
1209 | */ | ||
1210 | headlen = skb_headlen(skb); | ||
1211 | if (toff >= headlen) { | ||
1212 | toff -= headlen; | ||
1213 | goto map_frag; | ||
1214 | } | ||
1215 | |||
1216 | /* | ||
1217 | * first map the linear region into the pages/partial map, skipping | ||
1218 | * any potential initial offset. | ||
1219 | */ | ||
1220 | len = 0; | ||
1221 | while (len < headlen) { | ||
1222 | void *p = skb->data + len; | ||
1223 | |||
1224 | poff = (unsigned long) p & (PAGE_SIZE - 1); | ||
1225 | plen = min_t(unsigned int, headlen - len, PAGE_SIZE - poff); | ||
1226 | len += plen; | ||
1227 | |||
1228 | if (toff) { | ||
1229 | if (plen <= toff) { | ||
1230 | toff -= plen; | ||
1231 | continue; | ||
1232 | } | ||
1233 | plen -= toff; | ||
1234 | poff += toff; | ||
1235 | toff = 0; | ||
1236 | } | ||
1237 | |||
1238 | plen = min(plen, tlen); | ||
1239 | if (!plen) | ||
1240 | break; | ||
1241 | |||
1242 | /* | ||
1243 | * just jump directly to update and return, no point | ||
1244 | * in going over fragments when the output is full. | ||
1245 | */ | ||
1246 | if (spd_fill_page(spd, virt_to_page(p), plen, poff, skb)) | ||
1247 | goto done; | ||
1248 | |||
1249 | tlen -= plen; | ||
1250 | } | ||
1251 | |||
1252 | /* | ||
1253 | * then map the fragments | ||
1254 | */ | ||
1255 | map_frag: | ||
1256 | for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { | ||
1257 | const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; | ||
1258 | |||
1259 | plen = f->size; | ||
1260 | poff = f->page_offset; | ||
1261 | |||
1262 | if (toff) { | ||
1263 | if (plen <= toff) { | ||
1264 | toff -= plen; | ||
1265 | continue; | ||
1266 | } | ||
1267 | plen -= toff; | ||
1268 | poff += toff; | ||
1269 | toff = 0; | ||
1270 | } | ||
1271 | |||
1272 | plen = min(plen, tlen); | ||
1273 | if (!plen) | ||
1274 | break; | ||
1275 | |||
1276 | if (spd_fill_page(spd, f->page, plen, poff, skb)) | ||
1277 | break; | ||
1278 | |||
1279 | tlen -= plen; | ||
1280 | } | ||
1281 | |||
1282 | done: | ||
1283 | if (spd->nr_pages - nr_pages) { | ||
1284 | *offset = 0; | ||
1285 | *total_len = tlen; | ||
1286 | return 0; | ||
1287 | } | ||
1288 | err: | ||
1289 | return 1; | ||
1290 | } | ||
1291 | |||
1292 | /* | ||
1293 | * Map data from the skb to a pipe. Should handle both the linear part, | ||
1294 | * the fragments, and the frag list. It does NOT handle frag lists within | ||
1295 | * the frag list, if such a thing exists. We'd probably need to recurse to | ||
1296 | * handle that cleanly. | ||
1297 | */ | ||
1298 | int skb_splice_bits(struct sk_buff *__skb, unsigned int offset, | ||
1299 | struct pipe_inode_info *pipe, unsigned int tlen, | ||
1300 | unsigned int flags) | ||
1301 | { | ||
1302 | struct partial_page partial[PIPE_BUFFERS]; | ||
1303 | struct page *pages[PIPE_BUFFERS]; | ||
1304 | struct splice_pipe_desc spd = { | ||
1305 | .pages = pages, | ||
1306 | .partial = partial, | ||
1307 | .flags = flags, | ||
1308 | .ops = &sock_pipe_buf_ops, | ||
1309 | .spd_release = sock_spd_release, | ||
1310 | }; | ||
1311 | struct sk_buff *skb; | ||
1312 | |||
1313 | /* | ||
1314 | * I'd love to avoid the clone here, but tcp_read_sock() | ||
1315 | * ignores reference counts and unconditonally kills the sk_buff | ||
1316 | * on return from the actor. | ||
1317 | */ | ||
1318 | skb = skb_clone(__skb, GFP_KERNEL); | ||
1319 | if (unlikely(!skb)) | ||
1320 | return -ENOMEM; | ||
1321 | |||
1322 | /* | ||
1323 | * __skb_splice_bits() only fails if the output has no room left, | ||
1324 | * so no point in going over the frag_list for the error case. | ||
1325 | */ | ||
1326 | if (__skb_splice_bits(skb, &offset, &tlen, &spd)) | ||
1327 | goto done; | ||
1328 | else if (!tlen) | ||
1329 | goto done; | ||
1330 | |||
1331 | /* | ||
1332 | * now see if we have a frag_list to map | ||
1333 | */ | ||
1334 | if (skb_shinfo(skb)->frag_list) { | ||
1335 | struct sk_buff *list = skb_shinfo(skb)->frag_list; | ||
1336 | |||
1337 | for (; list && tlen; list = list->next) { | ||
1338 | if (__skb_splice_bits(list, &offset, &tlen, &spd)) | ||
1339 | break; | ||
1340 | } | ||
1341 | } | ||
1342 | |||
1343 | done: | ||
1344 | /* | ||
1345 | * drop our reference to the clone, the pipe consumption will | ||
1346 | * drop the rest. | ||
1347 | */ | ||
1348 | kfree_skb(skb); | ||
1349 | |||
1350 | if (spd.nr_pages) { | ||
1351 | int ret; | ||
1352 | |||
1353 | /* | ||
1354 | * Drop the socket lock, otherwise we have reverse | ||
1355 | * locking dependencies between sk_lock and i_mutex | ||
1356 | * here as compared to sendfile(). We enter here | ||
1357 | * with the socket lock held, and splice_to_pipe() will | ||
1358 | * grab the pipe inode lock. For sendfile() emulation, | ||
1359 | * we call into ->sendpage() with the i_mutex lock held | ||
1360 | * and networking will grab the socket lock. | ||
1361 | */ | ||
1362 | release_sock(__skb->sk); | ||
1363 | ret = splice_to_pipe(pipe, &spd); | ||
1364 | lock_sock(__skb->sk); | ||
1365 | return ret; | ||
1366 | } | ||
1367 | |||
1368 | return 0; | ||
1369 | } | ||
1370 | |||
1125 | /** | 1371 | /** |
1126 | * skb_store_bits - store bits from kernel buffer to skb | 1372 | * skb_store_bits - store bits from kernel buffer to skb |
1127 | * @skb: destination buffer | 1373 | * @skb: destination buffer |