litmus-rt.git - The LITMUS^RT kernel.

Branch	Commit message	Author	Age
archive/unc-master-3.0	P-FP: fix BUG_ON releated to priority inheritance	Bjoern Brandenburg	13 years
archived-2013.1	uncachedev: mmap memory that is not cached by CPUs	Glenn Elliott	12 years
archived-private-master	Merge branch 'wip-2.6.34' into old-private-master	Andrea Bastoni	15 years
archived-semi-part	Merge branch 'wip-semi-part' of ssh://cvs/cvs/proj/litmus/repo/litmus2010 int...	Andrea Bastoni	15 years
demo	Further refinements	Jonathan Herman	14 years
ecrts-pgm-final	Merge branch 'wip-ecrts14-pgm' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus-r...	Glenn Elliott	12 years
ecrts14-pgm-final	Merge branch 'wip-ecrts14-pgm' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus-r...	Glenn Elliott	12 years
gpusync-rtss12	Final GPUSync implementation.	Glenn Elliott	12 years
gpusync/staging	Rename IKGLP R2DGLP.	Glenn Elliott	12 years
linux-tip	Merge branch 'slab/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/p...	Linus Torvalds	15 years
litmus2008-patch-series	add i386 feather-trace implementation	Bjoern B. Brandenburg	16 years
master	PSN-EDF: use inferred_sporadic_job_release_at	Bjoern Brandenburg	9 years
pgm	make it compile	Glenn Elliott	12 years
prop/litmus-signals	Infrastructure for Litmus signals.	Glenn Elliott	13 years
prop/robust-tie-break	Fixed bug in edf_higher_prio().	Glenn Elliott	13 years
staging	Fix tracepoint compilation error	Felipe Cerqueira	13 years
test	9/23/2016	Namhoon Kim	9 years
tracing-devel	Test kernel tracing events capabilities	Andrea Bastoni	16 years
v2.6.34-with-arm-patches	smsc911x: Add spinlocks around registers access	Catalin Marinas	15 years
v2015.1	Add ARM syscall def for get_current_budget	Bjoern Brandenburg	10 years
wip-2011.2-bbb	Litmus core: simplify np-section protocol	Bjoern B. Brandenburg	14 years
wip-2011.2-bbb-trace	Refactor sched_trace_log_message() -> debug_trace_log_message()	Andrea Bastoni	14 years
wip-2012.3-gpu	SOBLIV draining support for C-EDF.	Glenn Elliott	12 years
wip-2012.3-gpu-preport	pick up last C-RM file	Glenn Elliott	12 years
wip-2012.3-gpu-rtss13	Fix critical bug in GPU tracker.	Glenn Elliott	12 years
wip-2012.3-gpu-sobliv-budget-w-kshark	Proper sobliv draining and many bug fixes.	Glenn Elliott	12 years
wip-aedzl-final	Make it easier to compile AEDZL interfaces in liblitmus.	Glenn Elliott	15 years
wip-aedzl-revised	Add sched_trace data for Apative EDZL	Glenn Elliott	15 years
wip-arbit-deadline	Fix compilation bug.	Glenn Elliott	13 years
wip-aux-tasks	Description of refined aux task inheritance.	Glenn Elliott	13 years
wip-bbb	GSN-EDF & Core: improve debug TRACE'ing for NP sections	Bjoern B. Brandenburg	14 years
wip-bbb-prio-don	use correct timestamp	Bjoern B. Brandenburg	14 years
wip-better-break	Implement hash-based EDF tie-breaking.	Glenn Elliott	13 years
wip-binary-heap	Make C-EDF work with simplified binheap_delete	Glenn Elliott	13 years
wip-budget	Added support for choices in budget policy enforcement.	Glenn Elliott	15 years
wip-color	Summarize schedulability with final record	Jonathan Herman	13 years
wip-color-jlh	sched_color: Fixed two bugs causing crashing on experiment restart and a rare...	Jonathan Herman	13 years
wip-d10-hz1000	Enable HZ=1000 on District 10	Bjoern B. Brandenburg	15 years
wip-default-clustering	Feature: Make default C-EDF clustering compile-time configurable.	Glenn Elliott	15 years
wip-dissipation-jerickso	Update from 2.6.36 to 2.6.36.4	Jeremy Erickson	11 years
wip-dissipation2-jerickso	Update 2.6.36 to 2.6.36.4	Jeremy Erickson	11 years
wip-ecrts14-pgm	Merge branch 'wip-ecrts14-pgm' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus-r...	Glenn Elliott	12 years
wip-edf-hsb	last tested version	Jonathan Herman	14 years
wip-edf-os	Lookup table EDF-os	Jeremy Erickson	12 years
wip-edf-tie-break	Merge branch 'wip-edf-tie-break' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus...	Glenn Elliott	13 years
wip-edzl-critique	Use hr_timer's active checks instead of having own flag.	Glenn Elliott	15 years
wip-edzl-final	Implementation of the EDZL scheduler.	Glenn Elliott	15 years
wip-edzl-revised	Clean up comments.	Glenn Elliott	15 years
wip-events	Added support for tracing arbitrary actions.	Jonathan Herman	15 years
wip-extra-debug	DBG: add additional tracing	Bjoern B. Brandenburg	15 years
wip-fix-switch-jerickso	Attempt to fix race condition with plugin switching	Jeremy Erickson	15 years
wip-fix3	sched: show length of runqueue clock deactivation in /proc/sched_debug	Bjoern B. Brandenburg	15 years
wip-fmlp-dequeue	Improve FMLP queue management.	Glenn Elliott	14 years
wip-ft-irq-flag	Feather-Trace: keep track of interrupt-related interference.	Bjoern B. Brandenburg	14 years
wip-gpu-cleanup	Enable sched_trace log injection from userspace	Glenn Elliott	13 years
wip-gpu-interrupts	Remove option for threading of all softirqs.	Glenn Elliott	14 years
wip-gpu-rtas12	Generalized GPU cost predictors + EWMA. (untested)	Glenn Elliott	13 years
wip-gpu-rtss12	Final GPUSync implementation.	Glenn Elliott	13 years
wip-gpu-rtss12-srp	experimental changes to support GPUs under SRP	Glenn Elliott	13 years
wip-gpusync-merge	Cleanup priority tracking for budget enforcement.	Glenn Elliott	12 years
wip-ikglp	Move RSM and IKGLP imp. to own .c files	Glenn Elliott	13 years
wip-k-fmlp	Merge branch 'mpi-master' into wip-k-fmlp	Glenn Elliott	14 years
wip-kernel-coloring	Added recolor syscall	Namhoon Kim	7 years
wip-kernthreads	Kludge work-queue processing into klitirqd.	Glenn Elliott	15 years
wip-klmirqd-to-aux	Allow klmirqd threads to be given names.	Glenn Elliott	13 years
wip-kshark	Merge branch 'mpi-staging' into wip-kshark	Jonathan Herman	13 years
wip-litmus-3.2	Merge commit 'v3.2' into litmus-staging	Andrea Bastoni	13 years
wip-litmus2011.2	Cleanup: Coding conformance for affinity stuff.	Glenn Elliott	14 years
wip-litmus3.0-2011.2	Feather-Trace: keep track of interrupt-related interference.	Bjoern B. Brandenburg	14 years
wip-master-2.6.33-rt	Avoid deadlock when switching task policy to BACKGROUND (ugly)	Andrea Bastoni	15 years
wip-mc	Removed ARM-specific hacks which disabled less common mixed-criticality featu...	Jonathan Herman	12 years
wip-mc-bipasa	MC-EDF added	bipasa chattopadhyay	13 years
wip-mc-jerickso	Split C/D queues	Jeremy Erickson	15 years
wip-mc2-cache-slack	Manually patched mc^2 related code	Ming Yang	10 years
wip-mcrit-mac	cosmetic	Mac Mollison	15 years
wip-merge-3.0	Prevent Linux to send IPI and queue tasks on remote CPUs.	Andrea Bastoni	14 years
wip-merge-v3.0	Prevent Linux to send IPI and queue tasks on remote CPUs.	Andrea Bastoni	14 years
wip-migration-affinity	NULL affinity dereference in C-EDF.	Glenn Elliott	14 years
wip-mmap-uncache	share branch with others	Glenn Elliott	13 years
wip-modechange	RTSS 2017 submission	Namhoon Kim	8 years
wip-nested-locking	Appears to be working.	Bryan Ward	12 years
wip-omlp-gedf	First implementation of G-OMLP.	Glenn Elliott	15 years
wip-pai	Some cleanup of PAI	Glenn Elliott	14 years
wip-percore-lib	9/21/2016	Namhoon Kim	9 years
wip-performance	CONFIG_DONT_PREEMPT_ON_TIE: Don't preeempt a scheduled task on priority tie.	Glenn Elliott	14 years
wip-pgm	Add PGM support to C-FL	Glenn Elliott	12 years
wip-pgm-split	First draft of C-FL-split	Namhoon Kim	12 years
wip-pm-ovd	Add preemption-and-migration overhead tracing support	Andrea Bastoni	15 years
wip-prio-inh	P-EDF updated to use the generic pi framework.	Glenn Elliott	15 years
wip-prioq-dgl	BUG FIX: Support DGLs with PRIOQ_MUTEX	Glenn Elliott	13 years
wip-refactored-gedf	Generalizd architecture for GEDF-style scheduelrs to reduce code redundancy.	Glenn Elliott	15 years
wip-release-master-fix	bugfix: release master CPU must signal task was picked	Bjoern B. Brandenburg	14 years
wip-robust-tie-break	EDF priority tie-breaks.	Glenn Elliott	13 years
wip-rt-kshark	Move task time accounting into the complete_job method.	Jonathan Herman	13 years
wip-rtas12-pgm	Scheduling of PGM jobs.	Glenn Elliott	13 years
wip-semi-part	Fix compile error with newer GCC	Jeremy Erickson	12 years
wip-semi-part-edfos-jerickso	Use initial CPU set by client	Jeremy Erickson	12 years
wip-shared-lib	TODO: Fix condition checks in replicate_page_move_mapping()	Namhoon Kim	9 years
wip-shared-lib2	RTAS 2017 Submission ver.	Namhoon Kim	9 years
wip-shared-mem	Initial commit for shared library	Namhoon Kim	9 years
wip-splitting-jerickso	Fix release behavior	Jeremy Erickson	13 years
wip-splitting-omlp-jerickso	Bjoern's Dissertation Code with Priority Donation	Jeremy Erickson	13 years
wip-stage-binheap	An efficient binary heap implementation.	Glenn Elliott	13 years
wip-sun-port	Dynamic memory allocation and clean exit for FeatherTrace	Christopher Kenna	15 years
wip-timer-trace	bugfix: C-EDF, clear scheduled field of the correct CPU upon task_exit	Andrea Bastoni	15 years
wip-tracepoints	Add kernel-style events for sched_trace_XXX() functions	Andrea Bastoni	14 years

Tag	Download	Author	Age
2015.1	commit 8e51b37822...	Bjoern Brandenburg	10 years
2013.1	commit bcaacec1ca...	Glenn Elliott	12 years
2012.3	commit c158b5fbe4...	Jonathan Herman	13 years
2012.2	commit b53c479a0f...	Glenn Elliott	13 years
2012.1	commit 83b11ea1c6...	Bjoern B. Brandenburg	14 years
rtas12-mc-beta-exp	commit 8e236ee20f...	Christopher Kenna	14 years
2011.1	commit d11808b5c6...	Christopher Kenna	15 years
v2.6.37-rc4	commit e8a7e48bb2...	Linus Torvalds	15 years
v2.6.37-rc3	commit 3561d43fd2...	Linus Torvalds	15 years
v2.6.37-rc2	commit e53beacd23...	Linus Torvalds	15 years
v2.6.37-rc1	commit c8ddb2713c...	Linus Torvalds	15 years
v2.6.36	commit f6f94e2ab1...	Linus Torvalds	15 years
2010.2	commit 5c5456402d...	Bjoern B. Brandenburg	15 years
v2.6.36-rc8	commit cd07202cc8...	Linus Torvalds	15 years
v2.6.36-rc7	commit cb655d0f3d...	Linus Torvalds	15 years
v2.6.36-rc6	commit 899611ee7d...	Linus Torvalds	15 years
v2.6.36-rc5	commit b30a3f6257...	Linus Torvalds	15 years
v2.6.36-rc4	commit 49553c2ef8...	Linus Torvalds	15 years
v2.6.36-rc3	commit 2bfc96a127...	Linus Torvalds	15 years
v2.6.36-rc2	commit 76be97c1fc...	Linus Torvalds	15 years
v2.6.36-rc1	commit da5cabf80e...	Linus Torvalds	15 years
v2.6.35	commit 9fe6206f40...	Linus Torvalds	15 years
v2.6.35-rc6	commit b37fa16e78...	Linus Torvalds	15 years
v2.6.35-rc5	commit 1c5474a65b...	Linus Torvalds	15 years
v2.6.35-rc4	commit 815c4163b6...	Linus Torvalds	15 years
v2.6.35-rc3	commit 7e27d6e778...	Linus Torvalds	15 years
v2.6.35-rc2	commit e44a21b726...	Linus Torvalds	15 years
v2.6.35-rc1	commit 67a3e12b05...	Linus Torvalds	15 years
2010.1	commit 7c1ff4c544...	Andrea Bastoni	15 years
v2.6.34	commit e40152ee1e...	Linus Torvalds	15 years
v2.6.33.4	commit 4640b4e7d9...	Greg Kroah-Hartman	15 years
v2.6.34-rc7	commit b57f95a382...	Linus Torvalds	15 years
v2.6.34-rc6	commit 66f41d4c5c...	Linus Torvalds	15 years
v2.6.33.3	commit 3e7ad8ed97...	Greg Kroah-Hartman	15 years
v2.6.34-rc5	commit 01bf0b6457...	Linus Torvalds	15 years
v2.6.34-rc4	commit 0d0fb0f9c5...	Linus Torvalds	15 years
v2.6.33.2	commit 19f00f070c...	Greg Kroah-Hartman	15 years
v2.6.34-rc3	commit 2eaa9cfdf3...	Linus Torvalds	15 years
v2.6.34-rc2	commit 220bf991b0...	Linus Torvalds	16 years
v2.6.33.1	commit dbdafe5ccf...	Greg Kroah-Hartman	16 years
v2.6.34-rc1	commit 57d54889cd...	Linus Torvalds	16 years
v2.6.33	commit 60b341b778...	Linus Torvalds	16 years
v2.6.33-rc8	commit 724e6d3fe8...	Linus Torvalds	16 years
v2.6.33-rc7	commit 29275254ca...	Linus Torvalds	16 years
v2.6.33-rc6	commit abe94c756c...	Linus Torvalds	16 years
v2.6.33-rc5	commit 92dcffb916...	Linus Torvalds	16 years
v2.6.33-rc4	commit 7284ce6c9f...	Linus Torvalds	16 years
v2.6.33-rc3	commit 74d2e4f8d7...	Linus Torvalds	16 years
v2.6.33-rc2	commit 6b7b284958...	Linus Torvalds	16 years
v2.6.33-rc1	commit 55639353a0...	Linus Torvalds	16 years
v2.6.32	commit 22763c5cf3...	Linus Torvalds	16 years
v2.6.32-rc8	commit 648f4e3e50...	Linus Torvalds	16 years
v2.6.32-rc7	commit 156171c71a...	Linus Torvalds	16 years
v2.6.32-rc6	commit b419148e56...	Linus Torvalds	16 years
v2.6.32-rc5	commit 012abeea66...	Linus Torvalds	16 years
v2.6.32-rc4	commit 161291396e...	Linus Torvalds	16 years
v2.6.32-rc3	commit 374576a8b6...	Linus Torvalds	16 years
v2.6.32-rc1	commit 17d857be64...	Linus Torvalds	16 years
v2.6.32-rc2	commit 17d857be64...	Linus Torvalds	16 years
v2.6.31	commit 74fca6a428...	Linus Torvalds	16 years
v2.6.31-rc9	commit e07cccf404...	Linus Torvalds	16 years
v2.6.31-rc8	commit 326ba5010a...	Linus Torvalds	16 years
v2.6.31-rc7	commit 422bef879e...	Linus Torvalds	16 years
v2.6.31-rc6	commit 64f1607ffb...	Linus Torvalds	16 years
v2.6.31-rc5	commit ed680c4ad4...	Linus Torvalds	16 years
v2.6.31-rc4	commit 4be3bd7849...	Linus Torvalds	16 years
v2.6.31-rc3	commit 6847e154e3...	Linus Torvalds	16 years
v2.6.31-rc2	commit 8e4a718ff3...	Linus Torvalds	16 years
v2.6.31-rc1	commit 28d0325ce6...	Linus Torvalds	16 years
v2.6.30	commit 07a2039b8e...	Linus Torvalds	16 years
v2.6.30-rc8	commit 9fa7eb283c...	Linus Torvalds	16 years
v2.6.30-rc7	commit 59a3759d0f...	Linus Torvalds	16 years
v2.6.30-rc6	commit 1406de8e11...	Linus Torvalds	16 years
v2.6.30-rc5	commit 091bf7624d...	Linus Torvalds	16 years
v2.6.30-rc4	commit 091438dd56...	Linus Torvalds	16 years
v2.6.30-rc3	commit 0910697403...	Linus Torvalds	16 years
v2.6.30-rc2	commit 0882e8dd3a...	Linus Torvalds	16 years
v2.6.30-rc1	commit 577c9c456f...	Linus Torvalds	16 years
v2.6.29	commit 8e0ee43bc2...	Linus Torvalds	16 years
v2.6.29-rc8	commit 041b62374c...	Linus Torvalds	17 years
v2.6.29-rc7	commit fec6c6fec3...	Linus Torvalds	17 years
v2.6.29-rc6	commit 20f4d6c3a2...	Linus Torvalds	17 years
v2.6.29-rc5	commit d2f8d7ee1a...	Linus Torvalds	17 years
v2.6.29-rc4	commit 8e4921515c...	Linus Torvalds	17 years
v2.6.29-rc3	commit 18e352e4a7...	Linus Torvalds	17 years
v2.6.29-rc2	commit 1de9e8e70f...	Linus Torvalds	17 years
v2.6.29-rc1	commit c59765042f...	Linus Torvalds	17 years
v2.6.28	commit 4a6908a3a0...	Linus Torvalds	17 years
v2.6.28-rc9	commit 929096fe9f...	Linus Torvalds	17 years
v2.6.28-rc8	commit 8b1fae4e42...	Linus Torvalds	17 years
v2.6.28-rc7	commit 061e41fdb5...	Linus Torvalds	17 years
v2.6.28-rc6	commit 13d428afc0...	Linus Torvalds	17 years
v2.6.28-rc5	commit 9bf1a2445f...	Linus Torvalds	17 years
v2.6.28-rc4	commit f7160c7573...	Linus Torvalds	17 years
v2.6.28-rc3	commit 45beca08dd...	Linus Torvalds	17 years
v2.6.28-rc2	commit 0173a3265b...	Linus Torvalds	17 years
v2.6.28-rc1	commit 57f8f7b60d...	Linus Torvalds	17 years
v2.6.27	commit 3fa8749e58...	Linus Torvalds	17 years
v2.6.27-rc9	commit 4330ed8ed4...	Linus Torvalds	17 years
v2.6.27-rc8	commit 94aca1dac6...	Linus Torvalds	17 years
v2.6.27-rc7	commit 72d31053f6...	Linus Torvalds	17 years
v2.6.27-rc6	commit adee14b2e1...	Linus Torvalds	17 years
v2.6.27-rc5	commit 24342c34a0...	Linus Torvalds	17 years
v2.6.27-rc4	commit 6a55617ed5...	Linus Torvalds	17 years
v2.6.27-rc3	commit 30a2f3c60a...	Linus Torvalds	17 years
v2.6.27-rc2	commit 0967d61ea0...	Linus Torvalds	17 years
v2.6.27-rc1	commit 6e86841d05...	Linus Torvalds	17 years
v2.6.26	commit bce7f793da...	Linus Torvalds	17 years
v2.6.26-rc9	commit b7279469d6...	Linus Torvalds	17 years
v2.6.26-rc8	commit 543cf4cb3f...	Linus Torvalds	17 years
v2.6.26-rc7	commit d70ac829b7...	Linus Torvalds	17 years
v2.6.26-rc6	commit 5dd34572ad...	Linus Torvalds	17 years
v2.6.26-rc5	commit 53c8ba9540...	Linus Torvalds	17 years
v2.6.26-rc4	commit e490517a03...	Linus Torvalds	17 years
v2.6.26-rc3	commit b8291ad07a...	Linus Torvalds	17 years
v2.6.26-rc2	commit 492c2e476e...	Linus Torvalds	17 years
v2.6.26-rc1	commit 2ddcca36c8...	Linus Torvalds	17 years
v2.6.25	commit 4b119e21d0...	Linus Torvalds	17 years
v2.6.25-rc9	commit 120dd64cac...	Linus Torvalds	17 years
v2.6.25-rc8	commit 0e81a8ae37...	Linus Torvalds	17 years
v2.6.25-rc7	commit 05dda977f2...	Linus Torvalds	17 years
v2.6.25-rc6	commit a978b30af3...	Linus Torvalds	18 years
v2.6.25-rc5	commit cdeeeae056...	Linus Torvalds	18 years
v2.6.25-rc4	commit 29e8c3c304...	Linus Torvalds	18 years
v2.6.25-rc3	commit bfa274e243...	Linus Torvalds	18 years
v2.6.25-rc2	commit 101142c37b...	Linus Torvalds	18 years
v2.6.25-rc1	commit 19af35546d...	Linus Torvalds	18 years
v2.6.24	commit 49914084e7...	Linus Torvalds	18 years
v2.6.24-rc8	commit cbd9c88369...	Linus Torvalds	18 years
v2.6.24-rc7	commit 3ce5445046...	Linus Torvalds	18 years
v2.6.24-rc6	commit ea67db4cdb...	Linus Torvalds	18 years
v2.6.24-rc5	commit 82d29bf6dc...	Linus Torvalds	18 years
v2.6.24-rc4	commit 09b56adc98...	Linus Torvalds	18 years
v2.6.24-rc3	commit d9f8bcbf67...	Linus Torvalds	18 years
v2.6.24-rc2	commit dbeeb816e8...	Linus Torvalds	18 years
v2.6.24-rc1	commit c9927c2bf4...	Linus Torvalds	18 years
v2.6.23	commit bbf25010f1...	Linus Torvalds	18 years
v2.6.23-rc9	commit 3146b39c18...	Linus Torvalds	18 years
v2.6.23-rc8	commit 4942de4a0e...	Linus Torvalds	18 years
v2.6.23-rc7	commit 81cfe79b9c...	Linus Torvalds	18 years
v2.6.23-rc6	commit 0d4cbb5e7f...	Linus Torvalds	18 years
v2.6.23-rc5	commit 40ffbfad6b...	Linus Torvalds	18 years
v2.6.23-rc4	commit b07d68b5ca...	Linus Torvalds	18 years
v2.6.23-rc3	commit 39d3520c92...	Linus Torvalds	18 years
v2.6.23-rc2	commit d4ac2477fa...	Linus Torvalds	18 years
v2.6.23-rc1	commit f695baf2df...	Linus Torvalds	18 years
v2.6.22	commit 7dcca30a32...	Linus Torvalds	18 years
v2.6.22-rc7	commit a38d6181ff...	Linus Torvalds	18 years
v2.6.22-rc6	commit 189548642c...	Linus Torvalds	18 years
v2.6.22-rc5	commit 188e1f81ba...	Linus Torvalds	18 years
v2.6.22-rc4	commit 5ecd3100e6...	Linus Torvalds	18 years
v2.6.22-rc3	commit c420bc9f09...	Linus Torvalds	18 years
v2.6.22-rc2	commit 55b637c6a0...	Linus Torvalds	18 years
v2.6.22-rc1	commit 39403865d2...	Linus Torvalds	18 years
v2.6.21	commit de46c33745...	Linus Torvalds	18 years
v2.6.21-rc7	commit 94a05509a9...	Linus Torvalds	18 years
v2.6.21-rc6	commit a21bd69e15...	Linus Torvalds	18 years
v2.6.21-rc5	commit e0f2e3a06b...	Linus Torvalds	18 years
v2.6.21-rc4	commit db98e0b434...	Linus Torvalds	19 years
v2.6.21-rc3	commit 08e15e81a4...	Linus Torvalds	19 years
v2.6.21-rc2	commit 606135a308...	Linus Torvalds	19 years
v2.6.21-rc1	commit c8f71b01a5...	Linus Torvalds	19 years
v2.6.20	commit 62d0cfcb27...	Linus Torvalds	19 years
v2.6.20-rc7	commit f56df2f4db...	Linus Torvalds	19 years
v2.6.20-rc6	commit 99abfeafb5...	Linus Torvalds	19 years
v2.6.20-rc5	commit a8b3485287...	Linus Torvalds	19 years
v2.6.20-rc4	commit bf81b46482...	Linus Torvalds	19 years
v2.6.20-rc3	commit 669df1b478...	Linus Torvalds	19 years
v2.6.20-rc2	commit 3bf8ba38f3...	Linus Torvalds	19 years
v2.6.20-rc1	commit cc016448b0...	Linus Torvalds	19 years
v2.6.19	commit 0215ffb08c...	Linus Torvalds	19 years
v2.6.19-rc6	commit 44597f65f6...	Linus Torvalds	19 years
v2.6.19-rc5	commit 80c2188127...	Linus Torvalds	19 years
v2.6.19-rc4	commit ae99a78af3...	Linus Torvalds	19 years
v2.6.19-rc3	commit 7059abedd2...	Linus Torvalds	19 years
v2.6.19-rc2	commit b4bd8c6643...	Linus Torvalds	19 years
v2.6.19-rc1	commit d223a60106...	Linus Torvalds	19 years
v2.6.18	commit e478bec0ba...	Linus Torvalds	19 years
v2.6.18-rc7	commit 95064a75eb...	Linus Torvalds	19 years
v2.6.18-rc6	commit c336923b66...	Linus Torvalds	19 years
v2.6.18-rc5	commit 60d4684068...	Linus Torvalds	19 years
v2.6.18-rc4	commit 9f737633e6...	Linus Torvalds	19 years
v2.6.18-rc3	commit b6ff50833a...	Linus Torvalds	19 years
v2.6.18-rc2	commit 82d6897fef...	Linus Torvalds	19 years
v2.6.18-rc1	commit 120bda20c6...	Linus Torvalds	19 years
v2.6.17	commit 427abfa28a...	Linus Torvalds	19 years
v2.6.17-rc6	commit 1def630a6a...	Linus Torvalds	19 years
v2.6.17-rc5	commit a8bd60705a...	Linus Torvalds	19 years
v2.6.17-rc4	commit d8c3291c73...	Linus Torvalds	19 years
v2.6.17-rc3	commit 2be4d50295...	Linus Torvalds	19 years
v2.6.17-rc2	commit 8bbde0e6d5...	Linus Torvalds	19 years
v2.6.17-rc1	commit 6246b6128b...	Linus Torvalds	19 years
v2.6.16	commit 7705a8792b...	Linus Torvalds	20 years
v2.6.16-rc6	commit 535744878e...	Linus Torvalds	20 years
v2.6.16-rc5	commit b9a33cebac...	Linus Torvalds	20 years
v2.6.16-rc4	commit bd71c2b174...	Linus Torvalds	20 years
v2.6.16-rc3	commit e9bb4c9929...	Linus Torvalds	20 years
v2.6.16-rc2	commit 826eeb53a6...	Linus Torvalds	20 years
v2.6.16-rc1	commit 2664b25051...	Linus Torvalds	20 years
v2.6.15	commit 88026842b0...	Linus Torvalds	20 years
v2.6.15-rc7	commit f89f5948fc...	Linus Torvalds	20 years
v2.6.15-rc6	commit df7addbb45...	Linus Torvalds	20 years
v2.6.15-rc5	commit 436b0f76f2...	Linus Torvalds	20 years
v2.6.15-rc4	commit 5666c0947e...	Linus Torvalds	20 years
v2.6.15-rc3	commit 624f54be20...	Linus Torvalds	20 years
v2.6.15-rc2	commit 3bedff1d73...	Linus Torvalds	20 years
v2.6.15-rc1	commit cd52d1ee9a...	Linus Torvalds	20 years
v2.6.14	commit 741b2252a5...	Linus Torvalds	20 years
v2.6.14-rc5	commit 93918e9afc...	Linus Torvalds	20 years
v2.6.14-rc4	commit 907a426179...	Linus Torvalds	20 years
v2.6.14-rc3	commit 1c9426e8a5...	Linus Torvalds	20 years
v2.6.14-rc2	commit 676d55ae30...	Linus Torvalds	20 years
v2.6.14-rc1	commit 2f4ba45a75...	Linus Torvalds	20 years
v2.6.13	commit 02b3e4e2d7...	Linus Torvalds	20 years
v2.6.13-rc7	commit 0572e3da3f...	Linus Torvalds	20 years
v2.6.13-rc6	commit 6fc32179de...	Linus Torvalds	20 years
v2.6.13-rc5	commit 9a351e30d7...	Linus Torvalds	20 years
v2.6.13-rc4	commit 6395352334...	Linus Torvalds	20 years
v2.6.11	tree c39ae07f39...
v2.6.11-tree	tree c39ae07f39...
v2.6.12	commit 9ee1c939d1...
v2.6.12-rc2	commit 1da177e4c3...
v2.6.12-rc3	commit a2755a80f4...
v2.6.12-rc4	commit 88d7bd8cb9...
v2.6.12-rc5	commit 2a24ab628a...
v2.6.12-rc6	commit 7cef5677ef...
v2.6.13-rc1	commit 4c91aedb75...
v2.6.13-rc2	commit a18bcb7450...
v2.6.13-rc3	commit c32511e271...

/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ /* * Changes: * Pedro Roque : Fast Retransmit/Recovery. * Two receive queues. * Retransmit queue handled by TCP. * Better retransmit timer handling. * New congestion avoidance. * Header prediction. * Variable renaming. * * Eric : Fast Retransmit. * Randy Scott : MSS option defines. * Eric Schenk : Fixes to slow start algorithm. * Eric Schenk : Yet another double ACK bug. * Eric Schenk : Delayed ACK bug fixes. * Eric Schenk : Floyd style fast retrans war avoidance. * David S. Miller : Don't allow zero congestion window. * Eric Schenk : Fix retransmitter so that it sends * next packet on ack of previous packet. * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming * data segments. * Andi Kleen: Make sure we never ack data there is not * enough room for. Also make this condition * a fatal error if it might still happen. * Andi Kleen: Add tcp_measure_rcv_mss to make * connections with MSS<min(MTU,ann. MSS) * work without delayed acks. * Andi Kleen: Process packets with PSH set in the * fast path. * J Hadi Salim: ECN support * Andrei Gurtov, * Pasi Sarolahti, * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs */ #include <linux/mm.h> #include <linux/module.h> #include <linux/sysctl.h> #include <net/dst.h> #include <net/tcp.h> #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h> #include <net/netdma.h> int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; int sysctl_tcp_ecn __read_mostly; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 2; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_abc __read_mostly; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) /* Adapt the MSS value used to make delayed ack decision to the * real world. */ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; unsigned int len; icsk->icsk_ack.last_seg_size = 0; /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { icsk->icsk_ack.rcv_mss = len; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. * * "len" is invariant segment length, including TCP header. */ len += skb->data - skb_transport_header(skb); if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. * This observation (if it is correct 8)) allows * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ len -= tcp_sk(sk)->tcp_header_len; icsk->icsk_ack.last_seg_size = len; if (len == lss) { icsk->icsk_ack.rcv_mss = len; return; } } if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2; icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } } static void tcp_incr_quickack(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks == 0) quickacks = 2; if (quickacks > icsk->icsk_ack.quick) icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } void tcp_enter_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk); icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ static inline int tcp_in_quickack_mode(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) { if (tp->ecn_flags & TCP_ECN_OK) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb) { if (tcp_hdr(skb)->cwr) tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) { tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) { if (tp->ecn_flags & TCP_ECN_OK) { if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) tp->ecn_flags |= TCP_ECN_DEMAND_CWR; /* Funny extension: if ECT is not set on a segment, * it is surely retransmit. It is not in ECN RFC, * but Linux follows this rule. */ else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) tcp_enter_quickack_mode((struct sock *)tp); } } static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) { if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) return 1; return 0; } /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. */ static void tcp_fixup_sndbuf(struct sock *sk) { int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); if (sk->sk_sndbuf < 3 * sndmem) sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) * * All tcp_full_space() is split to two parts: "network" buffer, allocated * forward and advertised in receiver window (tp->rcv_wnd) and * "application buffer", required to isolate scheduling/application * latencies from network. * window_clamp is maximal advertised window. It can be less than * tcp_full_space(), in this case tcp_full_space() - window_clamp * is reserved for "application" buffer. The less window_clamp is * the smoother our behaviour from viewpoint of network, but the lower * throughput and the higher sensitivity of the connection to losses. 8) * * rcv_ssthresh is more strict window_clamp used at "slow start" * phase to predict further behaviour of this connection. * It is used for two goals: * - to enforce header prediction at sender, even when application * requires some significant "application buffer". It is check #1. * - to prevent pruning of receive queue because of misprediction * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ /* Slow part of check#2. */ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize) >> 1; int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) return 2 * inet_csk(sk)->icsk_ack.rcv_mss; truesize >>= 1; window >>= 1; } return 0; } static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && !tcp_memory_pressure) { int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ if (tcp_win_from_space(skb->truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; } } } /* 3. Tuning rcvbuf, when connection enters established state. */ static void tcp_fixup_rcvbuf(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); /* Try to select rcvbuf so that 4 mss-sized segments * will fit to window and corresponding skbs will fit to our rcvbuf. * (was 3; 4 is minimum to allow fast retransmit to work.) */ while (tcp_win_from_space(rcvmem) < tp->advmss) rcvmem += 128; if (sk->sk_rcvbuf < 4 * rcvmem) sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); } /* 4. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int maxwin; if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) tcp_fixup_rcvbuf(sk); if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_fixup_sndbuf(sk); tp->rcvq_space.space = tp->rcv_wnd; maxwin = tcp_full_space(sk); if (tp->window_clamp >= maxwin) { tp->window_clamp = maxwin; if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) tp->window_clamp = max(maxwin - (maxwin >> sysctl_tcp_app_win), 4 * tp->advmss); } /* Force reservation of one segment. */ if (sysctl_tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_time_stamp; } /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ack.quick = 0; if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_memory_pressure && atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); } /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. * It's better to underestimate the RCV_MSS rather than overestimate. * Overestimations make us ACKing less frequently than needed. * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). */ void tcp_initialize_rcv_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd / 2); hint = min(hint, TCP_MIN_RCVMSS); hint = max(hint, TCP_MIN_MSS); inet_csk(sk)->icsk_ack.rcv_mss = hint; } /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps> * * More detail on this code can be found at * <http://www.psc.edu/~jheffner/senior_thesis.ps>, * though this reference is out of date. A new paper * is pending. */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { u32 new_sample = tp->rcv_rtt_est.rtt; long m = sample; if (m == 0) m = 1; if (new_sample != 0) { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially * with chatty applications or bulk transfer apps which * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the * non-timestamp case, we do not smooth things out * else with timestamps disabled convergence takes too * long. */ if (!win_dep) { m -= (new_sample >> 3); new_sample += m; } else if (m < new_sample) new_sample = m << 3; } else { /* No previous measure. */ new_sample = m << 3; } if (tp->rcv_rtt_est.rtt != new_sample) tp->rcv_rtt_est.rtt = new_sample; } static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { if (tp->rcv_rtt_est.time == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; tp->rcv_rtt_est.time = tcp_time_stamp; } static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); } /* * This function should be called every time data is copied to user space. * It calculates the appropriate TCP receive buffer space. */ void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int time; int space; if (tp->rcvq_space.time == 0) goto new_measure; time = tcp_time_stamp - tp->rcvq_space.time; if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; space = 2 * (tp->copied_seq - tp->rcvq_space.seq); space = max(tp->rcvq_space.space, space); if (tp->rcvq_space.space != space) { int rcvmem; tp->rcvq_space.space = space; if (sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int new_clamp = space; /* Receive space grows, normalize in order to * take into account packet headers and sk_buff * structure overhead. */ space /= tp->advmss; if (!space) space = 1; rcvmem = (tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff)); while (tcp_win_from_space(rcvmem) < tp->advmss) rcvmem += 128; space *= rcvmem; space = min(space, sysctl_tcp_rmem[2]); if (space > sk->sk_rcvbuf) { sk->sk_rcvbuf = space; /* Make the window clamp follow along. */ tp->window_clamp = new_clamp; } } } new_measure: tp->rcvq_space.seq = tp->copied_seq; tp->rcvq_space.time = tcp_time_stamp; } /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The * problem is that "good" TCP's do slow start at the beginning of data * transmission. The means that until we send the first few ACK's the * sender will sit on his end and only queue most of his data, because * he can only send snd_cwnd unacked packets at any given time. For * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; inet_csk_schedule_ack(sk); tcp_measure_rcv_mss(sk, skb); tcp_rcv_rtt_measure(tp); now = tcp_time_stamp; if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize * delayed ACK engine. */ tcp_incr_quickack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } else { int m = now - icsk->icsk_ack.lrcvtime; if (m <= TCP_ATO_MIN / 2) { /* The fastest case is the first. */ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; } else if (m < icsk->icsk_ack.ato) { icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); sk_mem_reclaim(sk); } } icsk->icsk_ack.lrcvtime = now; TCP_ECN_check_ce(tp, skb); if (skb->len >= 128) tcp_grow_window(sk, skb); } static u32 tcp_rto_min(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); u32 rto_min = TCP_RTO_MIN; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric(dst, RTAX_RTO_MIN); return rto_min; } /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (m == 0) m = 1; if (tp->srtt != 0) { m -= (tp->srtt >> 3); /* m is now error in rtt est */ tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (tp->mdev >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (tp->mdev >> 2); /* similar update on mdev */ } tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ if (tp->mdev > tp->mdev_max) { tp->mdev_max = tp->mdev; if (tp->mdev_max > tp->rttvar) tp->rttvar = tp->mdev_max; } if (after(tp->snd_una, tp->rtt_seq)) { if (tp->mdev_max < tp->rttvar) tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max = tcp_rto_min(sk); } } else { /* no previous measure. */ tp->srtt = m << 3; /* take the measured time to be rtt */ tp->mdev = m << 1; /* make sure rto = 3*rtt */ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } } /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static inline void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) * * More seriously: * 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ } /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ static inline void tcp_bound_rto(struct sock *sk) { if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. This function is called only, when TCP finishes successfully i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. */ void tcp_update_metrics(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_nometrics_save) return; dst_confirm(dst); if (dst && (dst->flags & DST_HOST)) { const struct inet_connection_sock *icsk = inet_csk(sk); int m; if (icsk->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. * Reset our results. */ if (!(dst_metric_locked(dst, RTAX_RTT))) dst->metrics[RTAX_RTT - 1] = 0; return; } m = dst_metric(dst, RTAX_RTT) - tp->srtt; /* If newly calculated rtt larger than stored one, * store new one. Otherwise, use EWMA. Remember, * rtt overestimation is always better than underestimation. */ if (!(dst_metric_locked(dst, RTAX_RTT))) { if (m <= 0) dst->metrics[RTAX_RTT - 1] = tp->srtt; else dst->metrics[RTAX_RTT - 1] -= (m >> 3); } if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { if (m < 0) m = -m; /* Scale deviation to rttvar fixed point */ m >>= 1; if (m < tp->mdev) m = tp->mdev; if (m >= dst_metric(dst, RTAX_RTTVAR)) dst->metrics[RTAX_RTTVAR - 1] = m; else dst->metrics[RTAX_RTTVAR-1] -= (dst_metric(dst, RTAX_RTTVAR) - m)>>2; } if (tp->snd_ssthresh >= 0xFFFF) { /* Slow start still did not finish. */ if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; if (!dst_metric_locked(dst, RTAX_CWND) && tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!dst_metric_locked(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = max(tp->snd_cwnd >> 1, tp->snd_ssthresh); if (!dst_metric_locked(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; } else { /* Else slow start did not finish, cwnd is non-sense, ssthresh may be also invalid. */ if (!dst_metric_locked(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; } if (!dst_metric_locked(dst, RTAX_REORDERING)) { if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && tp->reordering != sysctl_tcp_reordering) dst->metrics[RTAX_REORDERING-1] = tp->reordering; } } } /* Numbers are taken from RFC3390. * * John Heffner states: * * The RFC specifies a window of no more than 4380 bytes * unless 2*MSS > 4380. Reading the pseudocode in the RFC * is a bit misleading because they use a clamp at 4380 bytes * rather than use a multiplier in the relevant range. */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { if (tp->mss_cache > 1460) cwnd = 2; else cwnd = (tp->mss_cache > 1095) ? 3 : 4; } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } /* Set slow start threshold and cwnd not falling to slow start */ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); tp->prior_ssthresh = 0; tp->bytes_acked = 0; if (icsk->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; if (set_ssthresh) tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; tp->high_seq = tp->snd_nxt; tp->snd_cwnd_stamp = tcp_time_stamp; TCP_ECN_queue_cwr(tp); tcp_set_ca_state(sk, TCP_CA_CWR); } } /* * Packet counting of FACK is based on in-order assumptions, therefore TCP * disables it when reordering is detected */ static void tcp_disable_fack(struct tcp_sock *tp) { /* RFC3517 uses different metric in lost marker => reset on change */ if (tcp_is_fack(tp)) tp->lost_skb_hint = NULL; tp->rx_opt.sack_ok &= ~2; } /* Take a notice that peer is sending D-SACKs */ static void tcp_dsack_seen(struct tcp_sock *tp) { tp->rx_opt.sack_ok |= 4; } /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); if (dst == NULL) goto reset; dst_confirm(dst); if (dst_metric_locked(dst, RTAX_CWND)) tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); if (dst_metric(dst, RTAX_SSTHRESH)) { tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; } if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { tcp_disable_fack(tp); tp->reordering = dst_metric(dst, RTAX_REORDERING); } if (dst_metric(dst, RTAX_RTT) == 0) goto reset; if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. * The segment is small and rtt may appear much * less than real one. Use per-dst memory * to make it more realistic. * * A bit of theory. RTT is time passed after "normal" sized packet * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever * tricks sort of "quick acks" for time long enough to decrease RTT * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ if (dst_metric(dst, RTAX_RTT) > tp->srtt) { tp->srtt = dst_metric(dst, RTAX_RTT); tp->rtt_seq = tp->snd_nxt; } if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { tp->mdev = dst_metric(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); tcp_bound_rto(sk); if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) goto reset; tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; return; reset: /* Play conservative. If timestamps are not * supported, TCP will fail to recalculate correct * rtt, if initial rto is too small. FORGET ALL AND RESET! */ if (!tp->rx_opt.saw_tstamp && tp->srtt) { tp->srtt = 0; tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; } } static void tcp_update_reordering(struct sock *sk, const int metric, const int ts) { struct tcp_sock *tp = tcp_sk(sk); if (metric > tp->reordering) { tp->reordering = min(TCP_MAX_REORDERING, metric); /* This exciting event is worth to be remembered. 8) */ if (ts) NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); else if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); else if (tcp_is_fack(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); #if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, tp->fackets_out, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif tcp_disable_fack(tp); } } /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). * Packets in queue with these bits set are counted in variables * sacked_out, retrans_out and lost_out, correspondingly. * * Valid combinations are: * Tag InFlight Description * 0 1 - orig segment is in flight. * S 0 - nothing flies, orig reached receiver. * L 0 - nothing flies, orig lost by net. * R 2 - both orig and retransmit are in flight. * L|R 1 - orig is lost, retransmit is in flight. * S|R 1 - orig reached receiver, retrans is still in flight. * (L|S|R is logically valid, it could occur when L|R is sacked, * but it is equivalent to plain S and code short-curcuits it to S. * L|S is logically invalid, it would mean -1 packet in flight 8)) * * These 6 states form finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) * 3. Loss detection event of one of three flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. * A''. Its FACK modfication, head until snd.fack is lost. * B. SACK arrives sacking data transmitted after never retransmitted * hole was sent out. * C. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. * * It is pleasant to note, that state diagram turns out to be commutative, * so that we are allowed not to be bothered by order of our actions, * when multiple events arrive simultaneously. (see the function below). * * Reordering detection. * -------------------- * Reordering metric is maximal distance, which a packet can be displaced * in packet stream. With SACKs we can estimate it: * * 1. SACK fills old hole and the corresponding segment was not * ever retransmitted -> reordering. Alas, we cannot use it * when segment was retransmitted. * 2. The last flaw is solved with D-SACK. D-SACK arrives * for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. * * SACK block validation. * ---------------------- * * SACK block range validation checks that the received SACK block fits to * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. * Note that SND.UNA is not included to the range though being valid because * it means that the receiver is rather inconsistent with itself reporting * SACK reneging when it should advance SND.UNA. Such SACK block this is * perfectly valid, however, in light of RFC2018 which explicitly states * that "SACK block MUST reflect the newest segment. Even if the newest * segment is going to be discarded ...", not that it looks very clever * in case of head skb. Due to potentional receiver driven attacks, we * choose to avoid immediate execution of a walk in write queue due to * reneging and defer head skb's loss recovery to standard loss recovery * procedure that will eventually trigger (nothing forbids us doing this). * * Implements also blockage to start_seq wrap-around. Problem lies in the * fact that though start_seq (s) is before end_seq (i.e., not reversed), * there's no guarantee that it will be before snd_nxt (n). The problem * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt * wrap (s_w): * * <- outs wnd -> <- wrapzone -> * u e n u_w e_w s n_w * | | | | | | | * |<------------+------+----- TCP seqno space --------------+---------->| * ...-- <2^31 ->| |<--------... * ...---- >2^31 ------>| |<--------... * * Current code wouldn't be vulnerable but it's better still to discard such * crazy SACK blocks. Doing this check for start_seq alone closes somewhat * similar case (end_seq after snd_nxt wrap) as earlier reversed check in * snd_nxt wrap -> snd_una region will then become "well defined", i.e., * equal to the ideal case (infinite seqno space without wrap caused issues). * * With D-SACK the lower bound is extended to cover sequence space below * SND.UNA down to undo_marker, which is the last point of interest. Yet * again, D-SACK block must not to go across snd_una (for the same reason as * for the normal SACK blocks, explained above). But there all simplicity * ends, TCP might receive valid D-SACKs below that. As long as they reside * fully below undo_marker they do not affect behavior in anyway and can * therefore be safely ignored. In rare cases (which are more or less * theoretical ones), the D-SACK will nicely cross that boundary due to skb * fragmentation and packet reordering past skb's retransmission. To consider * them correctly, the acceptable range must be extended even more though * the exact amount is rather hard to quantify. However, tp->max_window can * be used as an exaggerated estimate. */ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, u32 start_seq, u32 end_seq) { /* Too far in future, or reversed (interpretation is ambiguous) */ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) return 0; /* Nasty start_seq wrap-around check (see comments above) */ if (!before(start_seq, tp->snd_nxt)) return 0; /* In outstanding window? ...This is valid exit for D-SACKs too. * start_seq == snd_una is non-sensical (see comments above) */ if (after(start_seq, tp->snd_una)) return 1; if (!is_dsack || !tp->undo_marker) return 0; /* ...Then it's D-SACK, and must reside below snd_una completely */ if (!after(end_seq, tp->snd_una)) return 0; if (!before(start_seq, tp->undo_marker)) return 1; /* Too old */ if (!after(end_seq, tp->undo_marker)) return 0; /* Undo_marker boundary crossing (overestimates a lot). Known already: * start_seq < undo_marker and end_seq >= undo_marker. */ return !before(start_seq, end_seq - tp->max_window); } /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". * Event "C". Later note: FACK people cheated me again 8), we have to account * for reordering! Ugly, but should help. * * Search retransmitted skbs from write_queue that were sent when snd_nxt was * less than what is now known to be received by the other end (derived from * highest SACK block). Also calculate the lowest snd_nxt among the remaining * retransmitted skbs to avoid some costly processing per ACKs. */ static void tcp_mark_lost_retrans(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt = 0; u32 new_low_seq = tp->snd_nxt; u32 received_upto = tcp_highest_sack_seq(tp); if (!tcp_is_fack(tp) || !tp->retrans_out || !after(received_upto, tp->lost_retrans_low) || icsk->icsk_ca_state != TCP_CA_Recovery) return; tcp_for_write_queue(skb, sk) { u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; if (skb == tcp_send_head(sk)) break; if (cnt == tp->retrans_out) break; if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) continue; if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) continue; if (after(received_upto, ack_seq) && (tcp_is_fack(tp) || !before(received_upto, ack_seq + tp->reordering * tp->mss_cache))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); /* clear lost hint */ tp->retransmit_skb_hint = NULL; if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; } NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) new_low_seq = ack_seq; cnt += tcp_skb_pcount(skb); } } if (tp->retrans_out) tp->lost_retrans_low = new_low_seq; } static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) { u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); int dup_sack = 0; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { dup_sack = 1; tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); if (!after(end_seq_0, end_seq_1) && !before(start_seq_0, start_seq_1)) { dup_sack = 1; tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); } } /* D-SACK for already forgotten data... Do dumb counting. */ if (dup_sack && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; return dup_sack; } /* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment * which may fail and creates some hassle (caller must handle error case * returns). */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, u32 start_seq, u32 end_seq) { int in_sack, err; unsigned int pkt_len; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (tcp_skb_pcount(skb) > 1 && !in_sack && after(TCP_SKB_CB(skb)->end_seq, start_seq)) { in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); if (!in_sack) pkt_len = start_seq - TCP_SKB_CB(skb)->seq; else pkt_len = end_seq - TCP_SKB_CB(skb)->seq; err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); if (err < 0) return err; } return in_sack; } static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, int *reord, int dup_sack, int fack_count) { struct tcp_sock *tp = tcp_sk(sk); u8 sacked = TCP_SKB_CB(skb)->sacked; int flag = 0; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) *reord = min(fack_count, *reord); } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) return flag; if (!(sacked & TCPCB_SACKED_ACKED)) { if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, * we do not clear RETRANS, believing * that retransmission is still in flight. */ if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); /* clear lost hint */ tp->retransmit_skb_hint = NULL; } } else { if (!(sacked & TCPCB_RETRANS)) { /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ if (before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) *reord = min(fack_count, *reord); /* SACK enhanced F-RTO (RFC4138; Appendix B) */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) flag |= FLAG_ONLY_ORIG_SACKED; } if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); /* clear lost hint */ tp->retransmit_skb_hint = NULL; } } TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; flag |= FLAG_DATA_SACKED; tp->sacked_out += tcp_skb_pcount(skb); fack_count += tcp_skb_pcount(skb); /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += tcp_skb_pcount(skb); if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } /* D-SACK. We can detect redundant retransmission in S|R and plain R * frames and clear it. undo_retrans is decreased above, L|R frames * are accounted above as well. */ if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); tp->retransmit_skb_hint = NULL; } return flag; } static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, u32 start_seq, u32 end_seq, int dup_sack_in, int *fack_count, int *reord, int *flag) { tcp_for_write_queue_from(skb, sk) { int in_sack = 0; int dup_sack = dup_sack_in; if (skb == tcp_send_head(sk)) break; /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; if ((next_dup != NULL) && before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { in_sack = tcp_match_skb_to_sack(sk, skb, next_dup->start_seq, next_dup->end_seq); if (in_sack > 0) dup_sack = 1; } if (in_sack <= 0) in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); if (unlikely(in_sack < 0)) break; if (in_sack) *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, *fack_count); *fack_count += tcp_skb_pcount(skb); } return skb; } /* Avoid all extra work that is being done by sacktag while walking in * a normal way */ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, u32 skip_to_seq, int *fack_count) { tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) break; *fack_count += tcp_skb_pcount(skb); } return skb; } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, u32 skip_to_seq, int *fack_count, int *reord, int *flag) { if (next_dup == NULL) return skb; if (before(next_dup->start_seq, skip_to_seq)) { skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count); skb = tcp_sacktag_walk(skb, sk, NULL, next_dup->start_seq, next_dup->end_seq, 1, fack_count, reord, flag); } return skb; } static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) { return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); } static int tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[4]; struct tcp_sack_block *cache; struct sk_buff *skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE) >> 3; int used_sacks; int reord = tp->packets_out; int flag = 0; int found_dup_sack = 0; int fack_count; int i, j; int first_sack_index; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) tp->fackets_out = 0; tcp_highest_sack_reset(sk); } found_dup_sack = tcp_check_dsack(tp, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can * contain valid SACK info. */ if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window)) return 0; if (!tp->packets_out) goto out; used_sacks = 0; first_sack_index = 0; for (i = 0; i < num_sacks; i++) { int dup_sack = !i && found_dup_sack; sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); if (!tcp_is_sackblock_valid(tp, dup_sack, sp[used_sacks].start_seq, sp[used_sacks].end_seq)) { if (dup_sack) { if (!tp->undo_marker) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); else NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && !after(sp[used_sacks].end_seq, tp->snd_una)) continue; NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); } if (i == 0) first_sack_index = -1; continue; } /* Ignore very old stuff early */ if (!after(sp[used_sacks].end_seq, prior_snd_una)) continue; used_sacks++; } /* order SACK blocks to allow in order walk of the retrans queue */ for (i = used_sacks - 1; i > 0; i--) { for (j = 0; j < i; j++) { if (after(sp[j].start_seq, sp[j + 1].start_seq)) { struct tcp_sack_block tmp; tmp = sp[j]; sp[j] = sp[j + 1]; sp[j + 1] = tmp; /* Track where the first SACK block goes to */ if (j == first_sack_index) first_sack_index = j + 1; } } } skb = tcp_write_queue_head(sk); fack_count = 0; i = 0; if (!tp->sacked_out) { /* It's already past, so skip checking against it */ cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); } else { cache = tp->recv_sack_cache; /* Skip empty blocks in at head of the cache */ while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && !cache->end_seq) cache++; } while (i < used_sacks) { u32 start_seq = sp[i].start_seq; u32 end_seq = sp[i].end_seq; int dup_sack = (found_dup_sack && (i == first_sack_index)); struct tcp_sack_block *next_dup = NULL; if (found_dup_sack && ((i + 1) == first_sack_index)) next_dup = &sp[i + 1]; /* Event "B" in the comment above. */ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; /* Skip too early cached blocks */ while (tcp_sack_cache_ok(tp, cache) && !before(start_seq, cache->end_seq)) cache++; /* Can skip some work by looking recv_sack_cache? */ if (tcp_sack_cache_ok(tp, cache) && !dup_sack && after(end_seq, cache->start_seq)) { /* Head todo? */ if (before(start_seq, cache->start_seq)) { skb = tcp_sacktag_skip(skb, sk, start_seq, &fack_count); skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, cache->start_seq, dup_sack, &fack_count, &reord, &flag); } /* Rest of the block already fully processed? */ if (!after(end_seq, cache->end_seq)) goto advance_sp; skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq, &fack_count, &reord, &flag); /* ...tail remains todo... */ if (tcp_highest_sack_seq(tp) == cache->end_seq) { /* ...but better entrypoint exists! */ skb = tcp_highest_sack(sk); if (skb == NULL) break; fack_count = tp->fackets_out; cache++; goto walk; } skb = tcp_sacktag_skip(skb, sk, cache->end_seq, &fack_count); /* Check overlap against next cached too (past this one already) */ cache++; continue; } if (!before(start_seq, tcp_highest_sack_seq(tp))) { skb = tcp_highest_sack(sk); if (skb == NULL) break; fack_count = tp->fackets_out; } skb = tcp_sacktag_skip(skb, sk, start_seq, &fack_count); walk: skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq, dup_sack, &fack_count, &reord, &flag); advance_sp: /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct * due to in-order walk */ if (after(end_seq, tp->frto_highmark)) flag &= ~FLAG_ONLY_ORIG_SACKED; i++; } /* Clear the head of the cache sack blocks so we can skip it next time */ for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { tp->recv_sack_cache[i].start_seq = 0; tp->recv_sack_cache[i].end_seq = 0; } for (j = 0; j < used_sacks; j++) tp->recv_sack_cache[i++] = sp[j]; tcp_mark_lost_retrans(sk); tcp_verify_left_out(tp); if ((reord < tp->fackets_out) && ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, tp->fackets_out - reord, 0); out: #if FASTRETRANS_DEBUG > 0 BUG_TRAP((int)tp->sacked_out >= 0); BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0); #endif return flag; } /* Limits sacked_out so that sum with lost_out isn't ever larger than * packets_out. Returns zero if sacked_out adjustement wasn't necessary. */ int tcp_limit_reno_sacked(struct tcp_sock *tp) { u32 holes; holes = max(tp->lost_out, 1U); holes = min(holes, tp->packets_out); if ((tp->sacked_out + holes) > tp->packets_out) { tp->sacked_out = tp->packets_out - holes; return 1; } return 0; } /* If we receive more dupacks than we expected counting segments * in assumption of absent reordering, interpret this as reordering. * The only another reason could be bug in receiver TCP. */ static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_limit_reno_sacked(tp)) tcp_update_reordering(sk, tp->packets_out + addend, 0); } /* Emulate SACKs for SACKless connection: account for a new dupack. */ static void tcp_add_reno_sack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->sacked_out++; tcp_check_reno_reordering(sk, 0); tcp_verify_left_out(tp); } /* Account for ACK, ACKing some data in Reno Recovery phase. */ static void tcp_remove_reno_sacks(struct sock *sk, int acked) { struct tcp_sock *tp = tcp_sk(sk); if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else tp->sacked_out -= acked - 1; } tcp_check_reno_reordering(sk, acked); tcp_verify_left_out(tp); } static inline void tcp_reset_reno_sack(struct tcp_sock *tp) { tp->sacked_out = 0; } static int tcp_is_sackfrto(const struct tcp_sock *tp) { return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); } /* F-RTO can only be used if TCP has never retransmitted anything other than * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; if (!sysctl_tcp_frto) return 0; /* MTU probe and F-RTO won't really play nicely along currently */ if (icsk->icsk_mtup.probe_size) return 0; if (tcp_is_sackfrto(tp)) return 1; /* Avoid expensive walking of rexmit queue if possible */ if (tp->retrans_out > 1) return 0; skb = tcp_write_queue_head(sk); skb = tcp_write_queue_next(sk, skb); /* Skips head */ tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) return 0; /* Short-circuit when first non-SACKed skb has been checked */ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) break; } return 1; } /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO * recovery a bit and use heuristics in tcp_process_frto() to detect if * the RTO was spurious. Only clear SACKED_RETRANS of the head here to * keep retrans_out counting accurate (with SACK F-RTO, other than head * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS * bits are handled if the Loss state is really to be entered (in * tcp_enter_frto_loss). * * Do like tcp_enter_loss() would; when RTO expires the second time it * does: * "Reduce ssthresh if it has not yet been made inside this window." */ void tcp_enter_frto(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || tp->snd_una == tp->high_seq || ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); /* Our state is too optimistic in ssthresh() call because cwnd * is not reduced until tcp_enter_frto_loss() when previous F-RTO * recovery has not yet completed. Pattern would be this: RTO, * Cumulative ACK, RTO (2xRTO for the same segment does not end * up here twice). * RFC4138 should be more specific on what to do, even though * RTO is quite unlikely to occur after the first Cumulative ACK * due to back-off and complexity of triggering events ... */ if (tp->frto_counter) { u32 stored_cwnd; stored_cwnd = tp->snd_cwnd; tp->snd_cwnd = 2; tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = stored_cwnd; } else { tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); } /* ... in theory, cong.control module could do "any tricks" in * ssthresh(), which means that ca_state, lost bits and lost_out * counter would have to be faked before the call occurs. We * consider that too expensive, unlikely and hacky, so modules * using these in ssthresh() must deal these incompatibility * issues if they receives CA_EVENT_FRTO and frto_counter != 0 */ tcp_ca_event(sk, CA_EVENT_FRTO); } tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; skb = tcp_write_queue_head(sk); if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) tp->undo_marker = 0; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } tcp_verify_left_out(tp); /* Too bad if TCP was application limited */ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); /* Earlier loss recovery underway (see RFC4138; Appendix B). * The last condition is necessary at least in tp->frto_counter case. */ if (tcp_is_sackfrto(tp) && (tp->frto_counter || ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && after(tp->high_seq, tp->snd_una)) { tp->frto_highmark = tp->high_seq; } else { tp->frto_highmark = tp->snd_nxt; } tcp_set_ca_state(sk, TCP_CA_Disorder); tp->high_seq = tp->snd_nxt; tp->frto_counter = 1; } /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, * which indicates that we should follow the traditional RTO recovery, * i.e. mark everything lost and do go-back-N retransmission. */ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; tp->lost_out = 0; tp->retrans_out = 0; if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; /* * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... */ if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { /* For some reason this R-bit might get cleared? */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out += tcp_skb_pcount(skb); /* ...enter this if branch just for the first segment */ flag |= FLAG_DATA_ACKED; } else { if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; } /* Marking forward transmissions that were made after RTO lost * can cause unnecessary retransmissions in some scenarios, * SACK blocks will mitigate that in some but not in all cases. * We used to not mark them but it was causing break-ups with * receivers that do only in-order receival. * * TODO: we could detect presence of such receiver and select * different behavior per flow. */ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); } } tcp_verify_left_out(tp); tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->frto_counter = 0; tp->bytes_acked = 0; tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); tcp_clear_retrans_hints_partial(tp); } static void tcp_clear_retrans_partial(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = 0; } void tcp_clear_retrans(struct tcp_sock *tp) { tcp_clear_retrans_partial(tp); tp->fackets_out = 0; tp->sacked_out = 0; } /* Enter Loss state. If "how" is not zero, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ void tcp_enter_loss(struct sock *sk, int how) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->bytes_acked = 0; tcp_clear_retrans_partial(tp); if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); if (!how) { /* Push undo marker, if it was plain RTO and nothing * was retransmitted. */ tp->undo_marker = tp->snd_una; tcp_clear_retrans_hints_partial(tp); } else { tp->sacked_out = 0; tp->fackets_out = 0; tcp_clear_all_retrans_hints(tp); } tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); } } tcp_verify_left_out(tp); tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); /* Abort F-RTO algorithm if one is in progress */ tp->frto_counter = 0; } /* If ACK arrived pointing to a remembered SACK, it means that our * remembered SACKs do not reflect real state of receiver i.e. * receiver _host_ is heavily congested (or buggy). * * Do processing similar to RTO timeout. */ static int tcp_check_sack_reneging(struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); return 1; } return 0; } static inline int tcp_fackets_out(struct tcp_sock *tp) { return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; } /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs * counter when SACK is enabled (without SACK, sacked_out is used for * that purpose). * * Instead, with FACK TCP uses fackets_out that includes both SACKed * segments up to the highest received SACK block so far and holes in * between them. * * With reordering, holes may still be in flight, so RFC3517 recovery * uses pure sacked_out (total number of SACKed segments) even though * it violates the RFC that uses duplicate ACKs, often these are equal * but when e.g. out-of-window ACKs or packet duplication occurs, * they differ. Since neither occurs due to loss, TCP should really * ignore them. */ static inline int tcp_dupack_heurestics(struct tcp_sock *tp) { return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } static inline int tcp_head_timedout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); return tp->packets_out && tcp_skb_timedout(sk, tcp_write_queue_head(sk)); } /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * * "Open" Normal state, no dubious events, fast path. * "Disorder" In all the respects it is "Open", * but requires a bit more attention. It is entered when * we see some SACKs or dupacks. It is split of "Open" * mainly to move some processing from fast path to slow one. * "CWR" CWND was reduced due to some Congestion Notification event. * It can be ECN, ICMP source quench, local device congestion. * "Recovery" CWND was reduced, we are fast-retransmitting. * "Loss" CWND was reduced due to RTO timeout or SACK reneging. * * tcp_fastretrans_alert() is entered: * - each incoming ACK, if state is not "Open" * - when arrived ACK is unusual, namely: * * SACK * * Duplicate ACK. * * ECN ECE. * * Counting packets in flight is pretty simple. * * in_flight = packets_out - left_out + retrans_out * * packets_out is SND.NXT-SND.UNA counted in packets. * * retrans_out is number of retransmitted segments. * * left_out is number of segments left network, but not ACKed yet. * * left_out = sacked_out + lost_out * * sacked_out: Packets, which arrived to receiver out of order * and hence not ACKed. With SACKs this number is simply * amount of SACKed data. Even without SACKs * it is easy to give pretty reliable estimate of this number, * counting duplicate ACKs. * * lost_out: Packets lost by network. TCP has no explicit * "loss notification" feedback from network (for now). * It means that this number can be only _guessed_. * Actually, it is the heuristics to predict lossage that * distinguishes different algorithms. * * F.e. after RTO, when all the queue is considered as lost, * lost_out = packets_out and in_flight = retrans_out. * * Essentially, we have now two algorithms counting * lost packets. * * FACK: It is the simplest heuristics. As soon as we decided * that something is lost, we decide that _all_ not SACKed * packets until the most forward SACK are lost. I.e. * lost_out = fackets_out - sacked_out and left_out = fackets_out. * It is absolutely correct estimate, if network does not reorder * packets. And it loses any connection to reality when reordering * takes place. We use FACK by default until reordering * is suspected on the path to this destination. * * NewReno: when Recovery is entered, we assume that one segment * is lost (classic Reno). While we are in Recovery and * a partial ACK arrives, we assume that one more packet * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * * Imagine, that's all! Forget about all this shamanism about CWND inflation * deflation etc. CWND is real congestion window, never inflated, changes * only according to classic VJ rules. * * Really tricky (and requiring careful tuning) part of algorithm * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, * hence, slow down forward transmission. In fact, it determines the moment * when we decide that hole is caused by loss, rather than by a reorder. * * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill * holes, caused by lost packets. * * And the most logically complicated part of algorithm is undo * heuristics. We detect false retransmits due to both too early * fast retransmit (reordering) and underestimated RTO, analyzing * timestamps and D-SACKs. When we detect that some segments were * retransmitted by mistake and CWND reduction was wrong, we undo * window reduction and abort recovery phase. This logic is hidden * inside several functions named tcp_try_undo_<something>. */ /* This function decides, when we should leave Disordered state * and enter Recovery phase, reducing congestion window. * * Main question: may we further continue forward transmission * with the same cwnd? */ static int tcp_time_to_recover(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; /* Do not perform any recovery during F-RTO algorithm */ if (tp->frto_counter) return 0; /* Trick#1: The loss is proven. */ if (tp->lost_out) return 1; /* Not-A-Trick#2 : Classic rule... */ if (tcp_dupack_heurestics(tp) > tp->reordering) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ if (tcp_is_fack(tp) && tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay * recovery more? */ packets_out = tp->packets_out; if (packets_out <= tp->reordering && tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. */ return 1; } return 0; } /* RFC: This is from the original, I doubt that this is necessary at all: * clear xmit_retrans hint if seq of this skb is beyond hint. How could we * retransmitted past LOST markings in the first place? I'm not fully sure * about undo and end of connection cases, which can cause R without L? */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((tp->retransmit_skb_hint != NULL) && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = NULL; } /* Mark head of queue up as lost. With RFC3517 SACK, the packets is * is against sacked "cnt", otherwise it's against facked "cnt" */ static void tcp_mark_head_lost(struct sock *sk, int packets) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt, oldcnt; int err; unsigned int mss; BUG_TRAP(packets <= tp->packets_out); if (tp->lost_skb_hint) { skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; } else { skb = tcp_write_queue_head(sk); cnt = 0; } tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt; if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) break; oldcnt = cnt; if (tcp_is_fack(tp) || tcp_is_reno(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) cnt += tcp_skb_pcount(skb); if (cnt > packets) { if (tcp_is_sack(tp) || (oldcnt >= packets)) break; mss = skb_shinfo(skb)->gso_size; err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); if (err < 0) break; cnt = packets; } if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); tcp_verify_retransmit_hint(tp, skb); } } tcp_verify_left_out(tp); } /* Account newly detected lost packet(s) */ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_is_reno(tp)) { tcp_mark_head_lost(sk, 1); } else if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; tcp_mark_head_lost(sk, lost); } else { int sacked_upto = tp->sacked_out - tp->reordering; if (sacked_upto < fast_rexmit) sacked_upto = fast_rexmit; tcp_mark_head_lost(sk, sacked_upto); } /* New heuristics: it is possible only after we switched * to restart timer each time when something is ACKed. * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ if (tcp_is_fack(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint : tcp_write_queue_head(sk); tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; if (!tcp_skb_timedout(sk, skb)) break; if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); tcp_verify_retransmit_hint(tp, skb); } } tp->scoreboard_skb_hint = skb; tcp_verify_left_out(tp); } } /* CWND moderation, preventing bursts due to too big ACKs * in dubious situations. */ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + tcp_max_burst(tp)); tp->snd_cwnd_stamp = tcp_time_stamp; } /* Lower bound on congestion window is slow start threshold * unless congestion avoidance choice decides to overide it. */ static inline u32 tcp_cwnd_min(const struct sock *sk) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; } /* Decrease cwnd each second ack. */ static void tcp_cwnd_down(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { tp->snd_cwnd_cnt = decr & 1; decr >>= 1; if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); tp->snd_cwnd_stamp = tcp_time_stamp; } } /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ static inline int tcp_packet_delayed(struct tcp_sock *tp) { return !tp->retrans_stamp || (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp)); } /* Undo procedures. */ #if FASTRETRANS_DEBUG > 1 static void DBGUNDO(struct sock *sk, const char *msg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n", msg, NIP6(np->daddr), ntohs(inet->dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } #endif } #else #define DBGUNDO(x...) do { } while (0) #endif static void tcp_undo_cwr(struct sock *sk, const int undo) { struct tcp_sock *tp = tcp_sk(sk); if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->undo_cwnd) tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; /* There is something screwy going on with the retrans hints after an undo */ tcp_clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) { return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } /* People celebrate: "We love our President!" */ static int tcp_try_undo_recovery(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_may_undo(tp)) { /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); else NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); tp->undo_marker = 0; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ tcp_moderate_cwnd(tp); return 1; } tcp_set_ca_state(sk, TCP_CA_Open); return 0; } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ static void tcp_try_undo_dsack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); tcp_undo_cwr(sk, 1); tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); } } /* Undo during fast recovery after partial ACK. */ static int tcp_try_undo_partial(struct sock *sk, int acked) { struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed * packet, rather than with a retransmit. */ if (tp->retrans_out == 0) tp->retrans_stamp = 0; tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. * If the first packet was delayed, the rest * ones are most probably delayed as well. */ failed = 0; } return failed; } /* Undo during loss recovery after partial ACK. */ static int tcp_try_undo_loss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_may_undo(tp)) { struct sk_buff *skb; tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tcp_clear_all_retrans_hints(tp); DBGUNDO(sk, "partial loss"); tp->lost_out = 0; tcp_undo_cwr(sk, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return 1; } return 0; } static inline void tcp_complete_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } static void tcp_try_keep_open(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int state = TCP_CA_Open; if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { tcp_set_ca_state(sk, state); tp->high_seq = tp->snd_nxt; } } static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); tcp_verify_left_out(tp); if (!tp->frto_counter && tp->retrans_out == 0) tp->retrans_stamp = 0; if (flag & FLAG_ECE) tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); tcp_moderate_cwnd(tp); } else { tcp_cwnd_down(sk, flag); } } static void tcp_mtup_probe_failed(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; } static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); /* FIXME: breaks with very large cwnd */ tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_cwnd = tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache) / icsk->icsk_mtup.probe_size; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->rcv_ssthresh = tcp_current_ssthresh(sk); icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and * packets lost by network. * * Besides that it does CWND reduction, when packet loss is detected * and changes state of machine. * * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; if (WARN_ON(!tp->sacked_out && tp->fackets_out)) tp->fackets_out = 0; /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ if (flag & FLAG_ECE) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ if (tcp_check_sack_reneging(sk, flag)) return; /* C. Process data loss notification, provided it is valid. */ if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } /* D. Check consistency of the current state. */ tcp_verify_left_out(tp); /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { BUG_TRAP(tp->retrans_out == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { case TCP_CA_Loss: icsk->icsk_retransmits = 0; if (tcp_try_undo_recovery(sk)) return; break; case TCP_CA_CWR: /* CWR is to be held something *above* high_seq * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { tcp_complete_cwr(sk); tcp_set_ca_state(sk, TCP_CA_Open); } break; case TCP_CA_Disorder: tcp_try_undo_dsack(sk); if (!tp->undo_marker || /* For SACK case do not Open to allow to undo * catching for all duplicate ACKs. */ tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Open); } break; case TCP_CA_Recovery: if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk)) return; tcp_complete_cwr(sk); break; } } /* F. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else do_lost = tcp_try_undo_partial(sk, pkts_acked); break; case TCP_CA_Loss: if (flag & FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (!tcp_try_undo_loss(sk)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); return; } if (icsk->icsk_ca_state != TCP_CA_Open) return; /* Loss is undone; fall through to processing in Open state. */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (is_dupack) tcp_add_reno_sack(sk); } if (icsk->icsk_ca_state == TCP_CA_Disorder) tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk)) { tcp_try_to_open(sk, flag); return; } /* MTU probe failure: don't reduce cwnd */ if (icsk->icsk_ca_state < TCP_CA_CWR && icsk->icsk_mtup.probe_size && tp->snd_una == tp->mtu_probe.probe_seq_start) { tcp_mtup_probe_failed(sk); /* Restores the reduction we did in tcp_mtup_probe() */ tp->snd_cwnd++; tcp_simple_retransmit(sk); return; } /* Otherwise enter Recovery state */ if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); tp->high_seq = tp->snd_nxt; tp->prior_ssthresh = 0; tp->undo_marker = tp->snd_una; tp->undo_retrans = tp->retrans_out; if (icsk->icsk_ca_state < TCP_CA_CWR) { if (!(flag & FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); } tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); fast_rexmit = 1; } if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) tcp_update_scoreboard(sk, fast_rexmit); tcp_cwnd_down(sk, flag); tcp_xmit_retransmit_queue(sk); } /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Supersedes RFC1323) */ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the * left edge of the send window. * * See draft-ietf-tcplw-high-performance-00, section 3.3. * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> * * Changed: reset backoff as soon as we see the first valid sample. * If we do not, we get strongly overestimated rto. With timestamps * samples are accepted even from very old segments: f.e., when rtt=1 * increases to 8, we retransmit 5 times and after 8 seconds delayed * answer arrives rto becomes 120 seconds! If at least one of segments * in window is lost... Voila. --ANK (010210) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine * rtt estimates. Also, we must not reset the * backoff for rto until we get a non-retransmitted * packet. This allows us to deal with a situation * where the network delay has increased suddenly. * I.e. Karn's algorithm. (SIGCOMM '87, p5.) */ if (flag & FLAG_RETRANS_DATA_ACKED) return; tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, const s32 seq_rtt) { const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tcp_ack_saw_tstamp(sk, flag); else if (seq_rtt >= 0) tcp_ack_no_tstamp(sk, seq_rtt, flag); } static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ static void tcp_rearm_rto(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } } /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); u32 packets_acked; BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); packets_acked = tcp_skb_pcount(skb); if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return 0; packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { BUG_ON(tcp_skb_pcount(skb) == 0); BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); } return packets_acked; } /* Remove acknowledged frames from the retransmission queue. If our packet * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; u32 now = tcp_time_stamp; int fully_acked = 1; int flag = 0; u32 pkts_acked = 0; u32 reord = tp->packets_out; s32 seq_rtt = -1; s32 ca_seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u32 end_seq; u32 acked_pcount; u8 sacked = scb->sacked; /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || !after(tp->snd_una, scb->seq)) break; acked_pcount = tcp_tso_acked(sk, skb); if (!acked_pcount) break; fully_acked = 0; end_seq = tp->snd_una; } else { acked_pcount = tcp_skb_pcount(skb); end_seq = scb->end_seq; } /* MTU probing checks */ if (fully_acked && icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) { tcp_mtup_probe_success(sk, skb); } if (sacked & TCPCB_RETRANS) { if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; ca_seq_rtt = -1; seq_rtt = -1; if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) flag |= FLAG_NONHEAD_RETRANS_ACKED; } else { ca_seq_rtt = now - scb->when; last_ackt = skb->tstamp; if (seq_rtt < 0) { seq_rtt = ca_seq_rtt; } if (!(sacked & TCPCB_SACKED_ACKED)) reord = min(pkts_acked, reord); } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= acked_pcount; if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up))) tp->urg_mode = 0; tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not * true data, and if we misinform our callers that * this ACK acks real data, we will erroneously exit * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ if (!(scb->flags & TCPCB_FLAG_SYN)) { flag |= FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; } if (!fully_acked) break; tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); tcp_clear_all_retrans_hints(tp); } if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; tcp_ack_update_rtt(sk, flag, seq_rtt); tcp_rearm_rto(sk); if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); } else { /* Non-retransmitted hole got filled? That's reordering */ if (reord < prior_fackets) tcp_update_reordering(sk, tp->fackets_out - reord, 0); } tp->fackets_out -= min(pkts_acked, tp->fackets_out); if (ca_ops->pkts_acked) { s32 rtt_us = -1; /* Is the ACK triggering packet unambiguous? */ if (!(flag & FLAG_RETRANS_DATA_ACKED)) { /* High resolution needed and available? */ if (ca_ops->flags & TCP_CONG_RTT_STAMP && !ktime_equal(last_ackt, net_invalid_timestamp())) rtt_us = ktime_us_delta(ktime_get_real(), last_ackt); else if (ca_seq_rtt > 0) rtt_us = jiffies_to_usecs(ca_seq_rtt); } ca_ops->pkts_acked(sk, pkts_acked, rtt_us); } } #if FASTRETRANS_DEBUG > 0 BUG_TRAP((int)tp->sacked_out >= 0); BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); if (!tp->packets_out && tcp_is_sack(tp)) { icsk = inet_csk(sk); if (tp->lost_out) { printk(KERN_DEBUG "Leak l=%u %d\n", tp->lost_out, icsk->icsk_ca_state); tp->lost_out = 0; } if (tp->sacked_out) { printk(KERN_DEBUG "Leak s=%u %d\n", tp->sacked_out, icsk->icsk_ca_state); tp->sacked_out = 0; } if (tp->retrans_out) { printk(KERN_DEBUG "Leak r=%u %d\n", tp->retrans_out, icsk->icsk_ca_state); tp->retrans_out = 0; } } #endif return flag; } static void tcp_ack_probe(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); /* Was it a usable window open? */ if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! */ } else { inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), TCP_RTO_MAX); } } static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) { return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open); } static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) { const struct tcp_sock *tp = tcp_sk(sk); return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); } /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { return (after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); } /* Update our send window. * * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, u32 ack_seq) { struct tcp_sock *tp = tcp_sk(sk); int flag = 0; u32 nwin = ntohs(tcp_hdr(skb)->window); if (likely(!tcp_hdr(skb)->syn)) nwin <<= tp->rx_opt.snd_wscale; if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { flag |= FLAG_WIN_UPDATE; tcp_update_wl(tp, ack, ack_seq); if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; /* Note, it is the only place, where * fast path is recovered for sending TCP. */ tp->pred_flags = 0; tcp_fast_path_check(sk); if (nwin > tp->max_window) { tp->max_window = nwin; tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); } } } tp->snd_una = ack; return flag; } /* A very conservative spurious RTO response algorithm: reduce cwnd and * continue in congestion avoidance. */ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; TCP_ECN_queue_cwr(tp); tcp_moderate_cwnd(tp); } /* A conservative spurious RTO response algorithm: reduce cwnd using * rate halving and continue in congestion avoidance. */ static void tcp_ratehalving_spur_to_response(struct sock *sk) { tcp_enter_cwr(sk, 0); } static void tcp_undo_spur_to_response(struct sock *sk, int flag) { if (flag & FLAG_ECE) tcp_ratehalving_spur_to_response(sk); else tcp_undo_cwr(sk, 1); } /* F-RTO spurious RTO detection algorithm (RFC4138) * * F-RTO affects during two new ACKs following RTO (well, almost, see inline * comments). State (ACK number) is kept in frto_counter. When ACK advances * window (but not to or beyond highest sequence sent before RTO): * On First ACK, send two new segments out. * On Second ACK, RTO was likely spurious. Do spurious response (response * algorithm is not part of the F-RTO detection algorithm * given in RFC4138 but can be selected separately). * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss * and TCP falls back to conventional RTO recovery. F-RTO allows overriding * of Nagle, this is done using frto_counter states 2 and 3, when a new data * segment of any size sent during F-RTO, state 2 is upgraded to 3. * * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. * * SACK version: * on first step, wait until first cumulative ACK arrives, then move to * the second step. In second step, the next ACK decides. * * F-RTO is implemented (mainly) in four functions: * - tcp_use_frto() is used to determine if TCP is can use F-RTO * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is * called when tcp_use_frto() showed green light * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm * - tcp_enter_frto_loss() is called if there is not enough evidence * to prove that the RTO is indeed spurious. It transfers the control * from F-RTO to the conventional RTO recovery */ static int tcp_process_frto(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); tcp_verify_left_out(tp); /* Duplicate the behavior from Loss state (fastretrans_alert) */ if (flag & FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) tp->undo_marker = 0; if (!before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return 1; } if (!tcp_is_sackfrto(tp)) { /* RFC4138 shortcoming in step 2; should also have case c): * ACK isn't duplicate nor advances window, e.g., opposite dir * data, winupdate */ if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) return 1; if (!(flag & FLAG_DATA_ACKED)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), flag); return 1; } } else { if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { /* Prevent sending of new data. */ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)); return 1; } if ((tp->frto_counter >= 2) && (!(flag & FLAG_FORWARD_PROGRESS) || ((flag & FLAG_DATA_SACKED) && !(flag & FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ if (!(flag & FLAG_FORWARD_PROGRESS) && (flag & FLAG_NOT_DUP)) return 1; tcp_enter_frto_loss(sk, 3, flag); return 1; } } if (tp->frto_counter == 1) { /* tcp_may_send_now needs to see updated state */ tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; if (!tcp_may_send_now(sk)) tcp_enter_frto_loss(sk, 2, flag); return 1; } else { switch (sysctl_tcp_frto_response) { case 2: tcp_undo_spur_to_response(sk, flag); break; case 1: tcp_conservative_spur_to_response(tp); break; default: tcp_ratehalving_spur_to_response(sk); break; } tp->frto_counter = 0; tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPSPURIOUSRTOS); } return 0; } /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; u32 prior_fackets; int prior_packets; int frto_cwnd = 0; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. */ if (after(ack, tp->snd_nxt)) goto uninteresting_ack; if (before(ack, prior_snd_una)) goto old_ack; if (after(ack, prior_snd_una)) flag |= FLAG_SND_UNA_ADVANCED; if (sysctl_tcp_abc) { if (icsk->icsk_ca_state < TCP_CA_CWR) tp->bytes_acked += ack - prior_snd_una; else if (icsk->icsk_ca_state == TCP_CA_Loss) /* we assume just one segment left network */ tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache); } prior_fackets = tp->fackets_out; prior_in_flight = tcp_packets_in_flight(tp); if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. */ tcp_update_wl(tp, ack, ack_seq); tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; tcp_ca_event(sk, CA_EVENT_FAST_ACK); NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; tcp_ca_event(sk, CA_EVENT_SLOW_ACK); } /* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->sk_err_soft = 0; tp->rcv_tstamp = tcp_time_stamp; prior_packets = tp->packets_out; if (!prior_packets) goto no_queue; /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); /* Guarantee sacktag reordering detection against wrap-arounds */ if (before(tp->frto_highmark, tp->snd_una)) tp->frto_highmark = 0; if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, prior_in_flight); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) dst_confirm(sk->sk_dst_cache); return 1; no_queue: icsk->icsk_probes_out = 0; /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ if (tcp_send_head(sk)) tcp_ack_probe(sk); return 1; old_ack: if (TCP_SKB_CB(skb)->sacked) { tcp_sacktag_write_queue(sk, skb, prior_snd_una); if (icsk->icsk_ca_state == TCP_CA_Open) tcp_try_keep_open(sk); } uninteresting_ack: SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); return 0; } /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. */ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) { unsigned char *ptr; struct tcphdr *th = tcp_hdr(skb); int length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ switch (opcode) { case TCPOPT_MSS: if (opsize == TCPOLEN_MSS && th->syn && !estab) { u16 in_mss = get_unaligned_be16(ptr); if (in_mss) { if (opt_rx->user_mss && opt_rx->user_mss < in_mss) in_mss = opt_rx->user_mss; opt_rx->mss_clamp = in_mss; } } break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW && th->syn && !estab && sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { if (net_ratelimit()) printk(KERN_INFO "tcp_parse_options: Illegal window " "scaling value %d >14 received.\n", snd_wscale); snd_wscale = 14; } opt_rx->snd_wscale = snd_wscale; } break; case TCPOPT_TIMESTAMP: if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || (!estab && sysctl_tcp_timestamps))) { opt_rx->saw_tstamp = 1; opt_rx->rcv_tsval = get_unaligned_be32(ptr); opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); } break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM && th->syn && !estab && sysctl_tcp_sack) { opt_rx->sack_ok = 1; tcp_sack_reset(opt_rx); } break; case TCPOPT_SACK: if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && opt_rx->sack_ok) { TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; } break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: /* * The MD5 Hash has already been * checked (see tcp_v{4,6}_do_rcv()). */ break; #endif } ptr += opsize-2; length -= opsize; } } } /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, struct tcp_sock *tp) { if (th->doff == sizeof(struct tcphdr) >> 2) { tp->rx_opt.saw_tstamp = 0; return 0; } else if (tp->rx_opt.tstamp_ok && th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { __be32 *ptr = (__be32 *)(th + 1); if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { tp->rx_opt.saw_tstamp = 1; ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; tp->rx_opt.rcv_tsecr = ntohl(*ptr); return 1; } } tcp_parse_options(skb, &tp->rx_opt, 1); return 1; } static inline void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; tp->rx_opt.ts_recent_stamp = get_seconds(); } static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) { if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { /* PAWS bug workaround wrt. ACK frames, the PAWS discard * extra check below makes sure this can only happen * for pure ACK frames. -DaveM * * Not only, also it occurs for expired timestamps. */ if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) tcp_store_ts_recent(tp); } } /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM * * It is not fatal. If this ACK does _not_ change critical state (seqs, window) * it can pass through stack. So, the following predicate verifies that * this segment is not used for anything but congestion avoidance or * fast retransmit. Moreover, we even are able to eliminate most of such * second order effects, if we apply some small "replay" window (~RTO) * to timestamp space. * * All these measures still do not guarantee that we reject wrapped ACKs * on networks with high bandwidth, when sequence space is recycled fastly, * but it guarantees that such events will be very rare and do not affect * connection seriously. This doesn't look nice, but alas, PAWS is really * buggy extension. * * [ Later note. Even worse! It is buggy for segments _with_ data. RFC * states that events when retransmit arrives after original data are rare. * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is * the biggest problem on large power networks even with minor reordering. * OK, let's give it small replay window. If peer clock is even 1hz, it is safe * up to bandwidth of 18Gigabit/sec. 8) ] */ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th = tcp_hdr(skb); u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; return (/* 1. Pure ACK with correct sequence number. */ (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && /* 2. ... and duplicate ACK. */ ack == tp->snd_una && /* 3. ... and does not update window. */ !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && /* 4. ... and sits in replay window. */ (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && !tcp_disordered_ack(sk, skb)); } /* Check segment sequence number for validity. * * Segment controls are considered valid, if the segment * fits to the window after truncation to the window. Acceptability * of data (and SYN, FIN, of course) is checked separately. * See tcp_data_queue(), for example. * * Also, controls (RST is main one) are accepted using RCV.WUP instead * of RCV.NXT. Peer still did not advance his SND.UNA when we * delayed ACK, so that hisSND.UNA<=ourRCV.WUP. * (borrowed from freebsd) */ static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq) { return !before(end_seq, tp->rcv_wup) && !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); } /* When we get a reset we do this. */ static void tcp_reset(struct sock *sk) { /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: sk->sk_err = ECONNREFUSED; break; case TCP_CLOSE_WAIT: sk->sk_err = EPIPE; break; case TCP_CLOSE: return; default: sk->sk_err = ECONNRESET; } if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); tcp_done(sk); } /* * Process the FIN bit. This now behaves as it is supposed to work * and the FIN takes effect when it is validly part of sequence * space. Not before when we get holes. * * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT * (and thence onto LAST-ACK and finally, CLOSE, we never enter * TIME-WAIT) * * If we are in FINWAIT-1, a received FIN indicates simultaneous * close and we go into CLOSING (and later onto TIME-WAIT) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: case TCP_CLOSING: /* Received a retransmission of the FIN, do * nothing. */ break; case TCP_LAST_ACK: /* RFC793: Remain in the LAST-ACK state. */ break; case TCP_FIN_WAIT1: /* This case occurs when a simultaneous close * happens, we must ack the received FIN and * enter the CLOSING state. */ tcp_send_ack(sk); tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ tcp_send_ack(sk); tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these * cases we should never reach this piece of code. */ printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", __func__, sk->sk_state); break; } /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ __skb_queue_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); /* Do not send POLL_HUP for half duplex close. */ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } } static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) sp->start_seq = seq; if (after(end_seq, sp->end_seq)) sp->end_seq = end_seq; return 1; } return 0; } static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (tcp_is_sack(tp) && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); else NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT); tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok); } } static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (!tp->rx_opt.dsack) tcp_dsack_set(tp, seq, end_seq); else tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); if (tcp_is_sack(tp) && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) end_seq = tp->rcv_nxt; tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq); } } tcp_send_ack(sk); } /* These routines update the SACK block as out-of-order packets arrive or * in-order packets close up the sequence space. */ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) { int this_sack; struct tcp_sack_block *sp = &tp->selective_acks[0]; struct tcp_sack_block *swalk = sp + 1; /* See if the recent change to the first SACK eats into * or hits the sequence space of other SACK blocks, if so coalesce. */ for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) { if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { int i; /* Zap SWALK, by moving every further SACK up by one slot. * Decrease num_sacks. */ tp->rx_opt.num_sacks--; tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); for (i = this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i + 1]; continue; } this_sack++, swalk++; } } static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) { __u32 tmp; tmp = sack1->start_seq; sack1->start_seq = sack2->start_seq; sack2->start_seq = tmp; tmp = sack1->end_seq; sack1->end_seq = sack2->end_seq; sack2->end_seq = tmp; } static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_sack_block *sp = &tp->selective_acks[0]; int cur_sacks = tp->rx_opt.num_sacks; int this_sack; if (!cur_sacks) goto new_sack; for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { if (tcp_sack_extend(sp, seq, end_seq)) { /* Rotate this_sack to the first one. */ for (; this_sack > 0; this_sack--, sp--) tcp_sack_swap(sp, sp - 1); if (cur_sacks > 1) tcp_sack_maybe_coalesce(tp); return; } } /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. * * If the sack array is full, forget about the last one. */ if (this_sack >= 4) { this_sack--; tp->rx_opt.num_sacks--; sp--; } for (; this_sack > 0; this_sack--, sp--) *sp = *(sp - 1); new_sack: /* Build the new head SACK, and we're done. */ sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); } /* RCV.NXT advances, some SACKs should be eaten. */ static void tcp_sack_remove(struct tcp_sock *tp) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int num_sacks = tp->rx_opt.num_sacks; int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ if (skb_queue_empty(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; tp->rx_opt.eff_sacks = tp->rx_opt.dsack; return; } for (this_sack = 0; this_sack < num_sacks;) { /* Check if the start of the sack is covered by RCV.NXT. */ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; /* RCV.NXT must cover all the block! */ BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. */ for (i=this_sack+1; i < num_sacks; i++) tp->selective_acks[i-1] = tp->selective_acks[i]; num_sacks--; continue; } this_sack++; sp++; } if (num_sacks != tp->rx_opt.num_sacks) { tp->rx_opt.num_sacks = num_sacks; tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); } } /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; struct sk_buff *skb; while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { __u32 dsack = dsack_high; if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack); } if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received \n"); __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tcp_hdr(skb)->fin) tcp_fin(skb, sk, tcp_hdr(skb)); } } static int tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || !sk_rmem_schedule(sk, size)) { if (tcp_prune_queue(sk) < 0) return -1; if (!sk_rmem_schedule(sk, size)) { if (!tcp_prune_ofo_queue(sk)) return -1; if (!sk_rmem_schedule(sk, size)) return -1; } } return 0; } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; __skb_pull(skb, th->doff * 4); TCP_ECN_accept_cwr(tp, skb); if (tp->rx_opt.dsack) { tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks, 4 - tp->rx_opt.tstamp_ok); } /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (tcp_receive_window(tp) == 0) goto out_of_window; /* Ok. In sequence. In window. */ if (tp->ucopy.task == current && tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && sock_owned_by_user(sk) && !tp->urg_data) { int chunk = min_t(unsigned int, skb->len, tp->ucopy.len); __set_current_state(TASK_RUNNING); local_bh_enable(); if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { tp->ucopy.len -= chunk; tp->copied_seq += chunk; eaten = (chunk == skb->len && !th->fin); tcp_rcv_space_adjust(sk); } local_bh_disable(); } if (eaten <= 0) { queue_and_out: if (eaten < 0 && tcp_try_rmem_schedule(sk, skb->truesize)) goto drop; skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) tcp_event_data_recv(sk, skb); if (th->fin) tcp_fin(skb, sk, th); if (!skb_queue_empty(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ if (skb_queue_empty(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pingpong = 0; } if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); tcp_fast_path_check(sk); if (eaten > 0) __kfree_skb(skb); else if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, 0); return; } if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an immediate ack. */ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: tcp_enter_quickack_mode(sk); inet_csk_schedule_ack(sk); drop: __kfree_skb(skb); return; } /* Out of window. F.e. zero window probe. */ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) goto out_of_window; tcp_enter_quickack_mode(sk); if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. */ if (!tcp_receive_window(tp)) goto out_of_window; goto queue_and_out; } TCP_ECN_check_ce(tp, skb); if (tcp_try_rmem_schedule(sk, skb->truesize)) goto drop; /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); skb_set_owner_r(skb, sk); if (!skb_peek(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks = 1;