litmus-rt.git - The LITMUS^RT kernel.

Branch	Commit message	Author	Age
archive/unc-master-3.0	P-FP: fix BUG_ON releated to priority inheritance	Bjoern Brandenburg	13 years
archived-2013.1	uncachedev: mmap memory that is not cached by CPUs	Glenn Elliott	12 years
archived-private-master	Merge branch 'wip-2.6.34' into old-private-master	Andrea Bastoni	15 years
archived-semi-part	Merge branch 'wip-semi-part' of ssh://cvs/cvs/proj/litmus/repo/litmus2010 int...	Andrea Bastoni	15 years
demo	Further refinements	Jonathan Herman	14 years
ecrts-pgm-final	Merge branch 'wip-ecrts14-pgm' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus-r...	Glenn Elliott	12 years
ecrts14-pgm-final	Merge branch 'wip-ecrts14-pgm' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus-r...	Glenn Elliott	12 years
gpusync-rtss12	Final GPUSync implementation.	Glenn Elliott	12 years
gpusync/staging	Rename IKGLP R2DGLP.	Glenn Elliott	12 years
linux-tip	Merge branch 'slab/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/p...	Linus Torvalds	15 years
litmus2008-patch-series	add i386 feather-trace implementation	Bjoern B. Brandenburg	16 years
master	PSN-EDF: use inferred_sporadic_job_release_at	Bjoern Brandenburg	9 years
pgm	make it compile	Glenn Elliott	12 years
prop/litmus-signals	Infrastructure for Litmus signals.	Glenn Elliott	13 years
prop/robust-tie-break	Fixed bug in edf_higher_prio().	Glenn Elliott	13 years
staging	Fix tracepoint compilation error	Felipe Cerqueira	13 years
test	9/23/2016	Namhoon Kim	9 years
tracing-devel	Test kernel tracing events capabilities	Andrea Bastoni	16 years
v2.6.34-with-arm-patches	smsc911x: Add spinlocks around registers access	Catalin Marinas	15 years
v2015.1	Add ARM syscall def for get_current_budget	Bjoern Brandenburg	10 years
wip-2011.2-bbb	Litmus core: simplify np-section protocol	Bjoern B. Brandenburg	14 years
wip-2011.2-bbb-trace	Refactor sched_trace_log_message() -> debug_trace_log_message()	Andrea Bastoni	14 years
wip-2012.3-gpu	SOBLIV draining support for C-EDF.	Glenn Elliott	12 years
wip-2012.3-gpu-preport	pick up last C-RM file	Glenn Elliott	12 years
wip-2012.3-gpu-rtss13	Fix critical bug in GPU tracker.	Glenn Elliott	12 years
wip-2012.3-gpu-sobliv-budget-w-kshark	Proper sobliv draining and many bug fixes.	Glenn Elliott	12 years
wip-aedzl-final	Make it easier to compile AEDZL interfaces in liblitmus.	Glenn Elliott	15 years
wip-aedzl-revised	Add sched_trace data for Apative EDZL	Glenn Elliott	15 years
wip-arbit-deadline	Fix compilation bug.	Glenn Elliott	13 years
wip-aux-tasks	Description of refined aux task inheritance.	Glenn Elliott	13 years
wip-bbb	GSN-EDF & Core: improve debug TRACE'ing for NP sections	Bjoern B. Brandenburg	14 years
wip-bbb-prio-don	use correct timestamp	Bjoern B. Brandenburg	14 years
wip-better-break	Implement hash-based EDF tie-breaking.	Glenn Elliott	13 years
wip-binary-heap	Make C-EDF work with simplified binheap_delete	Glenn Elliott	13 years
wip-budget	Added support for choices in budget policy enforcement.	Glenn Elliott	15 years
wip-color	Summarize schedulability with final record	Jonathan Herman	13 years
wip-color-jlh	sched_color: Fixed two bugs causing crashing on experiment restart and a rare...	Jonathan Herman	13 years
wip-d10-hz1000	Enable HZ=1000 on District 10	Bjoern B. Brandenburg	15 years
wip-default-clustering	Feature: Make default C-EDF clustering compile-time configurable.	Glenn Elliott	15 years
wip-dissipation-jerickso	Update from 2.6.36 to 2.6.36.4	Jeremy Erickson	11 years
wip-dissipation2-jerickso	Update 2.6.36 to 2.6.36.4	Jeremy Erickson	11 years
wip-ecrts14-pgm	Merge branch 'wip-ecrts14-pgm' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus-r...	Glenn Elliott	12 years
wip-edf-hsb	last tested version	Jonathan Herman	14 years
wip-edf-os	Lookup table EDF-os	Jeremy Erickson	12 years
wip-edf-tie-break	Merge branch 'wip-edf-tie-break' of ssh://rtsrv.cs.unc.edu/home/litmus/litmus...	Glenn Elliott	13 years
wip-edzl-critique	Use hr_timer's active checks instead of having own flag.	Glenn Elliott	15 years
wip-edzl-final	Implementation of the EDZL scheduler.	Glenn Elliott	15 years
wip-edzl-revised	Clean up comments.	Glenn Elliott	15 years
wip-events	Added support for tracing arbitrary actions.	Jonathan Herman	15 years
wip-extra-debug	DBG: add additional tracing	Bjoern B. Brandenburg	15 years
wip-fix-switch-jerickso	Attempt to fix race condition with plugin switching	Jeremy Erickson	15 years
wip-fix3	sched: show length of runqueue clock deactivation in /proc/sched_debug	Bjoern B. Brandenburg	15 years
wip-fmlp-dequeue	Improve FMLP queue management.	Glenn Elliott	14 years
wip-ft-irq-flag	Feather-Trace: keep track of interrupt-related interference.	Bjoern B. Brandenburg	14 years
wip-gpu-cleanup	Enable sched_trace log injection from userspace	Glenn Elliott	13 years
wip-gpu-interrupts	Remove option for threading of all softirqs.	Glenn Elliott	14 years
wip-gpu-rtas12	Generalized GPU cost predictors + EWMA. (untested)	Glenn Elliott	13 years
wip-gpu-rtss12	Final GPUSync implementation.	Glenn Elliott	13 years
wip-gpu-rtss12-srp	experimental changes to support GPUs under SRP	Glenn Elliott	13 years
wip-gpusync-merge	Cleanup priority tracking for budget enforcement.	Glenn Elliott	11 years
wip-ikglp	Move RSM and IKGLP imp. to own .c files	Glenn Elliott	13 years
wip-k-fmlp	Merge branch 'mpi-master' into wip-k-fmlp	Glenn Elliott	14 years
wip-kernel-coloring	Added recolor syscall	Namhoon Kim	7 years
wip-kernthreads	Kludge work-queue processing into klitirqd.	Glenn Elliott	15 years
wip-klmirqd-to-aux	Allow klmirqd threads to be given names.	Glenn Elliott	13 years
wip-kshark	Merge branch 'mpi-staging' into wip-kshark	Jonathan Herman	13 years
wip-litmus-3.2	Merge commit 'v3.2' into litmus-staging	Andrea Bastoni	13 years
wip-litmus2011.2	Cleanup: Coding conformance for affinity stuff.	Glenn Elliott	14 years
wip-litmus3.0-2011.2	Feather-Trace: keep track of interrupt-related interference.	Bjoern B. Brandenburg	14 years
wip-master-2.6.33-rt	Avoid deadlock when switching task policy to BACKGROUND (ugly)	Andrea Bastoni	15 years
wip-mc	Removed ARM-specific hacks which disabled less common mixed-criticality featu...	Jonathan Herman	12 years
wip-mc-bipasa	MC-EDF added	bipasa chattopadhyay	13 years
wip-mc-jerickso	Split C/D queues	Jeremy Erickson	15 years
wip-mc2-cache-slack	Manually patched mc^2 related code	Ming Yang	10 years
wip-mcrit-mac	cosmetic	Mac Mollison	15 years
wip-merge-3.0	Prevent Linux to send IPI and queue tasks on remote CPUs.	Andrea Bastoni	14 years
wip-merge-v3.0	Prevent Linux to send IPI and queue tasks on remote CPUs.	Andrea Bastoni	14 years
wip-migration-affinity	NULL affinity dereference in C-EDF.	Glenn Elliott	14 years
wip-mmap-uncache	share branch with others	Glenn Elliott	13 years
wip-modechange	RTSS 2017 submission	Namhoon Kim	8 years
wip-nested-locking	Appears to be working.	Bryan Ward	12 years
wip-omlp-gedf	First implementation of G-OMLP.	Glenn Elliott	15 years
wip-pai	Some cleanup of PAI	Glenn Elliott	14 years
wip-percore-lib	9/21/2016	Namhoon Kim	9 years
wip-performance	CONFIG_DONT_PREEMPT_ON_TIE: Don't preeempt a scheduled task on priority tie.	Glenn Elliott	14 years
wip-pgm	Add PGM support to C-FL	Glenn Elliott	12 years
wip-pgm-split	First draft of C-FL-split	Namhoon Kim	12 years
wip-pm-ovd	Add preemption-and-migration overhead tracing support	Andrea Bastoni	15 years
wip-prio-inh	P-EDF updated to use the generic pi framework.	Glenn Elliott	15 years
wip-prioq-dgl	BUG FIX: Support DGLs with PRIOQ_MUTEX	Glenn Elliott	13 years
wip-refactored-gedf	Generalizd architecture for GEDF-style scheduelrs to reduce code redundancy.	Glenn Elliott	15 years
wip-release-master-fix	bugfix: release master CPU must signal task was picked	Bjoern B. Brandenburg	14 years
wip-robust-tie-break	EDF priority tie-breaks.	Glenn Elliott	13 years
wip-rt-kshark	Move task time accounting into the complete_job method.	Jonathan Herman	13 years
wip-rtas12-pgm	Scheduling of PGM jobs.	Glenn Elliott	13 years
wip-semi-part	Fix compile error with newer GCC	Jeremy Erickson	12 years
wip-semi-part-edfos-jerickso	Use initial CPU set by client	Jeremy Erickson	12 years
wip-shared-lib	TODO: Fix condition checks in replicate_page_move_mapping()	Namhoon Kim	9 years
wip-shared-lib2	RTAS 2017 Submission ver.	Namhoon Kim	9 years
wip-shared-mem	Initial commit for shared library	Namhoon Kim	9 years
wip-splitting-jerickso	Fix release behavior	Jeremy Erickson	13 years
wip-splitting-omlp-jerickso	Bjoern's Dissertation Code with Priority Donation	Jeremy Erickson	13 years
wip-stage-binheap	An efficient binary heap implementation.	Glenn Elliott	13 years
wip-sun-port	Dynamic memory allocation and clean exit for FeatherTrace	Christopher Kenna	15 years
wip-timer-trace	bugfix: C-EDF, clear scheduled field of the correct CPU upon task_exit	Andrea Bastoni	15 years
wip-tracepoints	Add kernel-style events for sched_trace_XXX() functions	Andrea Bastoni	14 years

Tag	Download	Author	Age
2015.1	commit 8e51b37822...	Bjoern Brandenburg	10 years
2013.1	commit bcaacec1ca...	Glenn Elliott	12 years
2012.3	commit c158b5fbe4...	Jonathan Herman	13 years
2012.2	commit b53c479a0f...	Glenn Elliott	13 years
2012.1	commit 83b11ea1c6...	Bjoern B. Brandenburg	14 years
rtas12-mc-beta-exp	commit 8e236ee20f...	Christopher Kenna	14 years
2011.1	commit d11808b5c6...	Christopher Kenna	15 years
v2.6.37-rc4	commit e8a7e48bb2...	Linus Torvalds	15 years
v2.6.37-rc3	commit 3561d43fd2...	Linus Torvalds	15 years
v2.6.37-rc2	commit e53beacd23...	Linus Torvalds	15 years
v2.6.37-rc1	commit c8ddb2713c...	Linus Torvalds	15 years
v2.6.36	commit f6f94e2ab1...	Linus Torvalds	15 years
2010.2	commit 5c5456402d...	Bjoern B. Brandenburg	15 years
v2.6.36-rc8	commit cd07202cc8...	Linus Torvalds	15 years
v2.6.36-rc7	commit cb655d0f3d...	Linus Torvalds	15 years
v2.6.36-rc6	commit 899611ee7d...	Linus Torvalds	15 years
v2.6.36-rc5	commit b30a3f6257...	Linus Torvalds	15 years
v2.6.36-rc4	commit 49553c2ef8...	Linus Torvalds	15 years
v2.6.36-rc3	commit 2bfc96a127...	Linus Torvalds	15 years
v2.6.36-rc2	commit 76be97c1fc...	Linus Torvalds	15 years
v2.6.36-rc1	commit da5cabf80e...	Linus Torvalds	15 years
v2.6.35	commit 9fe6206f40...	Linus Torvalds	15 years
v2.6.35-rc6	commit b37fa16e78...	Linus Torvalds	15 years
v2.6.35-rc5	commit 1c5474a65b...	Linus Torvalds	15 years
v2.6.35-rc4	commit 815c4163b6...	Linus Torvalds	15 years
v2.6.35-rc3	commit 7e27d6e778...	Linus Torvalds	15 years
v2.6.35-rc2	commit e44a21b726...	Linus Torvalds	15 years
v2.6.35-rc1	commit 67a3e12b05...	Linus Torvalds	15 years
2010.1	commit 7c1ff4c544...	Andrea Bastoni	15 years
v2.6.34	commit e40152ee1e...	Linus Torvalds	15 years
v2.6.33.4	commit 4640b4e7d9...	Greg Kroah-Hartman	15 years
v2.6.34-rc7	commit b57f95a382...	Linus Torvalds	15 years
v2.6.34-rc6	commit 66f41d4c5c...	Linus Torvalds	15 years
v2.6.33.3	commit 3e7ad8ed97...	Greg Kroah-Hartman	15 years
v2.6.34-rc5	commit 01bf0b6457...	Linus Torvalds	15 years
v2.6.34-rc4	commit 0d0fb0f9c5...	Linus Torvalds	15 years
v2.6.33.2	commit 19f00f070c...	Greg Kroah-Hartman	15 years
v2.6.34-rc3	commit 2eaa9cfdf3...	Linus Torvalds	15 years
v2.6.34-rc2	commit 220bf991b0...	Linus Torvalds	15 years
v2.6.33.1	commit dbdafe5ccf...	Greg Kroah-Hartman	16 years
v2.6.34-rc1	commit 57d54889cd...	Linus Torvalds	16 years
v2.6.33	commit 60b341b778...	Linus Torvalds	16 years
v2.6.33-rc8	commit 724e6d3fe8...	Linus Torvalds	16 years
v2.6.33-rc7	commit 29275254ca...	Linus Torvalds	16 years
v2.6.33-rc6	commit abe94c756c...	Linus Torvalds	16 years
v2.6.33-rc5	commit 92dcffb916...	Linus Torvalds	16 years
v2.6.33-rc4	commit 7284ce6c9f...	Linus Torvalds	16 years
v2.6.33-rc3	commit 74d2e4f8d7...	Linus Torvalds	16 years
v2.6.33-rc2	commit 6b7b284958...	Linus Torvalds	16 years
v2.6.33-rc1	commit 55639353a0...	Linus Torvalds	16 years
v2.6.32	commit 22763c5cf3...	Linus Torvalds	16 years
v2.6.32-rc8	commit 648f4e3e50...	Linus Torvalds	16 years
v2.6.32-rc7	commit 156171c71a...	Linus Torvalds	16 years
v2.6.32-rc6	commit b419148e56...	Linus Torvalds	16 years
v2.6.32-rc5	commit 012abeea66...	Linus Torvalds	16 years
v2.6.32-rc4	commit 161291396e...	Linus Torvalds	16 years
v2.6.32-rc3	commit 374576a8b6...	Linus Torvalds	16 years
v2.6.32-rc1	commit 17d857be64...	Linus Torvalds	16 years
v2.6.32-rc2	commit 17d857be64...	Linus Torvalds	16 years
v2.6.31	commit 74fca6a428...	Linus Torvalds	16 years
v2.6.31-rc9	commit e07cccf404...	Linus Torvalds	16 years
v2.6.31-rc8	commit 326ba5010a...	Linus Torvalds	16 years
v2.6.31-rc7	commit 422bef879e...	Linus Torvalds	16 years
v2.6.31-rc6	commit 64f1607ffb...	Linus Torvalds	16 years
v2.6.31-rc5	commit ed680c4ad4...	Linus Torvalds	16 years
v2.6.31-rc4	commit 4be3bd7849...	Linus Torvalds	16 years
v2.6.31-rc3	commit 6847e154e3...	Linus Torvalds	16 years
v2.6.31-rc2	commit 8e4a718ff3...	Linus Torvalds	16 years
v2.6.31-rc1	commit 28d0325ce6...	Linus Torvalds	16 years
v2.6.30	commit 07a2039b8e...	Linus Torvalds	16 years
v2.6.30-rc8	commit 9fa7eb283c...	Linus Torvalds	16 years
v2.6.30-rc7	commit 59a3759d0f...	Linus Torvalds	16 years
v2.6.30-rc6	commit 1406de8e11...	Linus Torvalds	16 years
v2.6.30-rc5	commit 091bf7624d...	Linus Torvalds	16 years
v2.6.30-rc4	commit 091438dd56...	Linus Torvalds	16 years
v2.6.30-rc3	commit 0910697403...	Linus Torvalds	16 years
v2.6.30-rc2	commit 0882e8dd3a...	Linus Torvalds	16 years
v2.6.30-rc1	commit 577c9c456f...	Linus Torvalds	16 years
v2.6.29	commit 8e0ee43bc2...	Linus Torvalds	16 years
v2.6.29-rc8	commit 041b62374c...	Linus Torvalds	17 years
v2.6.29-rc7	commit fec6c6fec3...	Linus Torvalds	17 years
v2.6.29-rc6	commit 20f4d6c3a2...	Linus Torvalds	17 years
v2.6.29-rc5	commit d2f8d7ee1a...	Linus Torvalds	17 years
v2.6.29-rc4	commit 8e4921515c...	Linus Torvalds	17 years
v2.6.29-rc3	commit 18e352e4a7...	Linus Torvalds	17 years
v2.6.29-rc2	commit 1de9e8e70f...	Linus Torvalds	17 years
v2.6.29-rc1	commit c59765042f...	Linus Torvalds	17 years
v2.6.28	commit 4a6908a3a0...	Linus Torvalds	17 years
v2.6.28-rc9	commit 929096fe9f...	Linus Torvalds	17 years
v2.6.28-rc8	commit 8b1fae4e42...	Linus Torvalds	17 years
v2.6.28-rc7	commit 061e41fdb5...	Linus Torvalds	17 years
v2.6.28-rc6	commit 13d428afc0...	Linus Torvalds	17 years
v2.6.28-rc5	commit 9bf1a2445f...	Linus Torvalds	17 years
v2.6.28-rc4	commit f7160c7573...	Linus Torvalds	17 years
v2.6.28-rc3	commit 45beca08dd...	Linus Torvalds	17 years
v2.6.28-rc2	commit 0173a3265b...	Linus Torvalds	17 years
v2.6.28-rc1	commit 57f8f7b60d...	Linus Torvalds	17 years
v2.6.27	commit 3fa8749e58...	Linus Torvalds	17 years
v2.6.27-rc9	commit 4330ed8ed4...	Linus Torvalds	17 years
v2.6.27-rc8	commit 94aca1dac6...	Linus Torvalds	17 years
v2.6.27-rc7	commit 72d31053f6...	Linus Torvalds	17 years
v2.6.27-rc6	commit adee14b2e1...	Linus Torvalds	17 years
v2.6.27-rc5	commit 24342c34a0...	Linus Torvalds	17 years
v2.6.27-rc4	commit 6a55617ed5...	Linus Torvalds	17 years
v2.6.27-rc3	commit 30a2f3c60a...	Linus Torvalds	17 years
v2.6.27-rc2	commit 0967d61ea0...	Linus Torvalds	17 years
v2.6.27-rc1	commit 6e86841d05...	Linus Torvalds	17 years
v2.6.26	commit bce7f793da...	Linus Torvalds	17 years
v2.6.26-rc9	commit b7279469d6...	Linus Torvalds	17 years
v2.6.26-rc8	commit 543cf4cb3f...	Linus Torvalds	17 years
v2.6.26-rc7	commit d70ac829b7...	Linus Torvalds	17 years
v2.6.26-rc6	commit 5dd34572ad...	Linus Torvalds	17 years
v2.6.26-rc5	commit 53c8ba9540...	Linus Torvalds	17 years
v2.6.26-rc4	commit e490517a03...	Linus Torvalds	17 years
v2.6.26-rc3	commit b8291ad07a...	Linus Torvalds	17 years
v2.6.26-rc2	commit 492c2e476e...	Linus Torvalds	17 years
v2.6.26-rc1	commit 2ddcca36c8...	Linus Torvalds	17 years
v2.6.25	commit 4b119e21d0...	Linus Torvalds	17 years
v2.6.25-rc9	commit 120dd64cac...	Linus Torvalds	17 years
v2.6.25-rc8	commit 0e81a8ae37...	Linus Torvalds	17 years
v2.6.25-rc7	commit 05dda977f2...	Linus Torvalds	17 years
v2.6.25-rc6	commit a978b30af3...	Linus Torvalds	17 years
v2.6.25-rc5	commit cdeeeae056...	Linus Torvalds	18 years
v2.6.25-rc4	commit 29e8c3c304...	Linus Torvalds	18 years
v2.6.25-rc3	commit bfa274e243...	Linus Torvalds	18 years
v2.6.25-rc2	commit 101142c37b...	Linus Torvalds	18 years
v2.6.25-rc1	commit 19af35546d...	Linus Torvalds	18 years
v2.6.24	commit 49914084e7...	Linus Torvalds	18 years
v2.6.24-rc8	commit cbd9c88369...	Linus Torvalds	18 years
v2.6.24-rc7	commit 3ce5445046...	Linus Torvalds	18 years
v2.6.24-rc6	commit ea67db4cdb...	Linus Torvalds	18 years
v2.6.24-rc5	commit 82d29bf6dc...	Linus Torvalds	18 years
v2.6.24-rc4	commit 09b56adc98...	Linus Torvalds	18 years
v2.6.24-rc3	commit d9f8bcbf67...	Linus Torvalds	18 years
v2.6.24-rc2	commit dbeeb816e8...	Linus Torvalds	18 years
v2.6.24-rc1	commit c9927c2bf4...	Linus Torvalds	18 years
v2.6.23	commit bbf25010f1...	Linus Torvalds	18 years
v2.6.23-rc9	commit 3146b39c18...	Linus Torvalds	18 years
v2.6.23-rc8	commit 4942de4a0e...	Linus Torvalds	18 years
v2.6.23-rc7	commit 81cfe79b9c...	Linus Torvalds	18 years
v2.6.23-rc6	commit 0d4cbb5e7f...	Linus Torvalds	18 years
v2.6.23-rc5	commit 40ffbfad6b...	Linus Torvalds	18 years
v2.6.23-rc4	commit b07d68b5ca...	Linus Torvalds	18 years
v2.6.23-rc3	commit 39d3520c92...	Linus Torvalds	18 years
v2.6.23-rc2	commit d4ac2477fa...	Linus Torvalds	18 years
v2.6.23-rc1	commit f695baf2df...	Linus Torvalds	18 years
v2.6.22	commit 7dcca30a32...	Linus Torvalds	18 years
v2.6.22-rc7	commit a38d6181ff...	Linus Torvalds	18 years
v2.6.22-rc6	commit 189548642c...	Linus Torvalds	18 years
v2.6.22-rc5	commit 188e1f81ba...	Linus Torvalds	18 years
v2.6.22-rc4	commit 5ecd3100e6...	Linus Torvalds	18 years
v2.6.22-rc3	commit c420bc9f09...	Linus Torvalds	18 years
v2.6.22-rc2	commit 55b637c6a0...	Linus Torvalds	18 years
v2.6.22-rc1	commit 39403865d2...	Linus Torvalds	18 years
v2.6.21	commit de46c33745...	Linus Torvalds	18 years
v2.6.21-rc7	commit 94a05509a9...	Linus Torvalds	18 years
v2.6.21-rc6	commit a21bd69e15...	Linus Torvalds	18 years
v2.6.21-rc5	commit e0f2e3a06b...	Linus Torvalds	18 years
v2.6.21-rc4	commit db98e0b434...	Linus Torvalds	19 years
v2.6.21-rc3	commit 08e15e81a4...	Linus Torvalds	19 years
v2.6.21-rc2	commit 606135a308...	Linus Torvalds	19 years
v2.6.21-rc1	commit c8f71b01a5...	Linus Torvalds	19 years
v2.6.20	commit 62d0cfcb27...	Linus Torvalds	19 years
v2.6.20-rc7	commit f56df2f4db...	Linus Torvalds	19 years
v2.6.20-rc6	commit 99abfeafb5...	Linus Torvalds	19 years
v2.6.20-rc5	commit a8b3485287...	Linus Torvalds	19 years
v2.6.20-rc4	commit bf81b46482...	Linus Torvalds	19 years
v2.6.20-rc3	commit 669df1b478...	Linus Torvalds	19 years
v2.6.20-rc2	commit 3bf8ba38f3...	Linus Torvalds	19 years
v2.6.20-rc1	commit cc016448b0...	Linus Torvalds	19 years
v2.6.19	commit 0215ffb08c...	Linus Torvalds	19 years
v2.6.19-rc6	commit 44597f65f6...	Linus Torvalds	19 years
v2.6.19-rc5	commit 80c2188127...	Linus Torvalds	19 years
v2.6.19-rc4	commit ae99a78af3...	Linus Torvalds	19 years
v2.6.19-rc3	commit 7059abedd2...	Linus Torvalds	19 years
v2.6.19-rc2	commit b4bd8c6643...	Linus Torvalds	19 years
v2.6.19-rc1	commit d223a60106...	Linus Torvalds	19 years
v2.6.18	commit e478bec0ba...	Linus Torvalds	19 years
v2.6.18-rc7	commit 95064a75eb...	Linus Torvalds	19 years
v2.6.18-rc6	commit c336923b66...	Linus Torvalds	19 years
v2.6.18-rc5	commit 60d4684068...	Linus Torvalds	19 years
v2.6.18-rc4	commit 9f737633e6...	Linus Torvalds	19 years
v2.6.18-rc3	commit b6ff50833a...	Linus Torvalds	19 years
v2.6.18-rc2	commit 82d6897fef...	Linus Torvalds	19 years
v2.6.18-rc1	commit 120bda20c6...	Linus Torvalds	19 years
v2.6.17	commit 427abfa28a...	Linus Torvalds	19 years
v2.6.17-rc6	commit 1def630a6a...	Linus Torvalds	19 years
v2.6.17-rc5	commit a8bd60705a...	Linus Torvalds	19 years
v2.6.17-rc4	commit d8c3291c73...	Linus Torvalds	19 years
v2.6.17-rc3	commit 2be4d50295...	Linus Torvalds	19 years
v2.6.17-rc2	commit 8bbde0e6d5...	Linus Torvalds	19 years
v2.6.17-rc1	commit 6246b6128b...	Linus Torvalds	19 years
v2.6.16	commit 7705a8792b...	Linus Torvalds	19 years
v2.6.16-rc6	commit 535744878e...	Linus Torvalds	20 years
v2.6.16-rc5	commit b9a33cebac...	Linus Torvalds	20 years
v2.6.16-rc4	commit bd71c2b174...	Linus Torvalds	20 years
v2.6.16-rc3	commit e9bb4c9929...	Linus Torvalds	20 years
v2.6.16-rc2	commit 826eeb53a6...	Linus Torvalds	20 years
v2.6.16-rc1	commit 2664b25051...	Linus Torvalds	20 years
v2.6.15	commit 88026842b0...	Linus Torvalds	20 years
v2.6.15-rc7	commit f89f5948fc...	Linus Torvalds	20 years
v2.6.15-rc6	commit df7addbb45...	Linus Torvalds	20 years
v2.6.15-rc5	commit 436b0f76f2...	Linus Torvalds	20 years
v2.6.15-rc4	commit 5666c0947e...	Linus Torvalds	20 years
v2.6.15-rc3	commit 624f54be20...	Linus Torvalds	20 years
v2.6.15-rc2	commit 3bedff1d73...	Linus Torvalds	20 years
v2.6.15-rc1	commit cd52d1ee9a...	Linus Torvalds	20 years
v2.6.14	commit 741b2252a5...	Linus Torvalds	20 years
v2.6.14-rc5	commit 93918e9afc...	Linus Torvalds	20 years
v2.6.14-rc4	commit 907a426179...	Linus Torvalds	20 years
v2.6.14-rc3	commit 1c9426e8a5...	Linus Torvalds	20 years
v2.6.14-rc2	commit 676d55ae30...	Linus Torvalds	20 years
v2.6.14-rc1	commit 2f4ba45a75...	Linus Torvalds	20 years
v2.6.13	commit 02b3e4e2d7...	Linus Torvalds	20 years
v2.6.13-rc7	commit 0572e3da3f...	Linus Torvalds	20 years
v2.6.13-rc6	commit 6fc32179de...	Linus Torvalds	20 years
v2.6.13-rc5	commit 9a351e30d7...	Linus Torvalds	20 years
v2.6.13-rc4	commit 6395352334...	Linus Torvalds	20 years
v2.6.11	tree c39ae07f39...
v2.6.11-tree	tree c39ae07f39...
v2.6.12	commit 9ee1c939d1...
v2.6.12-rc2	commit 1da177e4c3...
v2.6.12-rc3	commit a2755a80f4...
v2.6.12-rc4	commit 88d7bd8cb9...
v2.6.12-rc5	commit 2a24ab628a...
v2.6.12-rc6	commit 7cef5677ef...
v2.6.13-rc1	commit 4c91aedb75...
v2.6.13-rc2	commit a18bcb7450...
v2.6.13-rc3	commit c32511e271...

/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PACKET - implements raw packet sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Fixes: * Alan Cox : verify_area() now used correctly * Alan Cox : new skbuff lists, look ma no backlogs! * Alan Cox : tidied skbuff lists. * Alan Cox : Now uses generic datagram routines I * added. Also fixed the peek/read crash * from all old Linux datagram code. * Alan Cox : Uses the improved datagram code. * Alan Cox : Added NULL's for socket options. * Alan Cox : Re-commented the code. * Alan Cox : Use new kernel side addressing * Rob Janssen : Correct MTU usage. * Dave Platt : Counter leaks caused by incorrect * interrupt locking and some slightly * dubious gcc output. Can you read * compiler: it said _VOLATILE_ * Richard Kooijman : Timestamp fixes. * Alan Cox : New buffers. Use sk->mac.raw. * Alan Cox : sendmsg/recvmsg support. * Alan Cox : Protocol setting support * Alexey Kuznetsov : Untied from IPv4 stack. * Cyrus Durgin : Fixed kerneld for kmod. * Michal Ostrowski : Module initialization cleanup. * Ulises Alonso : Frame number limit removal and * packet_set_ring memory leak. * Eric Biederman : Allow for > 8 byte hardware addresses. * The convention is that longer addresses * will simply extend the hardware address * byte arrays at the end of sockaddr_ll * and packet_mreq. * Johann Baudy : Added TX RING. * Chetan Loke : Implemented TPACKET_V3 block abstraction * layer. * Copyright (C) 2011, <lokec@ccs.neu.edu> * * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * */ #include <linux/types.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/wireless.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <asm/uaccess.h> #include <asm/ioctls.h> #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/if_vlan.h> #include <linux/virtio_net.h> #include <linux/errqueue.h> #include <linux/net_tstamp.h> #include <linux/reciprocal_div.h> #ifdef CONFIG_INET #include <net/inet_common.h> #endif #include "internal.h" /* Assumptions: - if device has no dev->hard_header routine, it adds and removes ll header inside itself. In this case ll header is invisible outside of device, but higher levels still should reserve dev->hard_header_len. Some devices are enough clever to reallocate skb, when header will not fit to reserved space (tunnel), another ones are silly (PPP). - packet socket receives packets with pulled ll header, so that SOCK_RAW should push it back. On receive: ----------- Incoming, dev->hard_header!=NULL mac_header -> ll header data -> data Outgoing, dev->hard_header!=NULL mac_header -> ll header data -> ll header Incoming, dev->hard_header==NULL mac_header -> UNKNOWN position. It is very likely, that it points to ll header. PPP makes it, that is wrong, because introduce assymetry between rx and tx paths. data -> data Outgoing, dev->hard_header==NULL mac_header -> data. ll header is still not built! data -> data Resume If dev->hard_header==NULL we are unlikely to restore sensible ll header. On transmit: ------------ dev->hard_header != NULL mac_header -> ll header data -> ll header dev->hard_header == NULL (ll header is added by device, we cannot control it) mac_header -> data data -> data We should set nh.raw on output to correct posistion, packet classifier depends on it. */ /* Private packet socket structures. */ /* identical to struct packet_mreq except it has * a longer address field. */ struct packet_mreq_max { int mr_ifindex; unsigned short mr_type; unsigned short mr_alen; unsigned char mr_address[MAX_ADDR_LEN]; }; union tpacket_uhdr { struct tpacket_hdr *h1; struct tpacket2_hdr *h2; struct tpacket3_hdr *h3; void *raw; }; static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring); #define V3_ALIGNMENT (8) #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) #define BLK_PLUS_PRIV(sz_of_priv) \ (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) #define PGV_FROM_VMALLOC 1 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x))) struct packet_sock; static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status); static void packet_increment_head(struct packet_ring_buffer *buff); static int prb_curr_blk_in_use(struct tpacket_kbdq_core *, struct tpacket_block_desc *); static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, struct packet_sock *); static void prb_retire_current_block(struct tpacket_kbdq_core *, struct packet_sock *, unsigned int status); static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); static void prb_retire_rx_blk_timer_expired(unsigned long); static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); static void prb_init_blk_timer(struct packet_sock *, struct tpacket_kbdq_core *, void (*func) (unsigned long)); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); struct packet_skb_cb { unsigned int origlen; union { struct sockaddr_pkt pkt; struct sockaddr_ll ll; } sa; }; #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) #define GET_PBLOCK_DESC(x, bid) \ ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) #define GET_NEXT_PRB_BLK_NUM(x) \ (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ ((x)->kactive_blk_num+1) : 0) static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); /* register_prot_hook must be invoked with the po->bind_lock held, * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). */ static void register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); if (!po->running) { if (po->fanout) { __fanout_link(sk, po); } else { dev_add_pack(&po->prot_hook); rcu_assign_pointer(po->cached_dev, po->prot_hook.dev); } sock_hold(sk); po->running = 1; } } /* {,__}unregister_prot_hook() must be invoked with the po->bind_lock * held. If the sync parameter is true, we will temporarily drop * the po->bind_lock and do a synchronize_net to make sure no * asynchronous packet processing paths still refer to the elements * of po->prot_hook. If the sync parameter is false, it is the * callers responsibility to take care of this. */ static void __unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); po->running = 0; if (po->fanout) { __fanout_unlink(sk, po); } else { __dev_remove_pack(&po->prot_hook); RCU_INIT_POINTER(po->cached_dev, NULL); } __sock_put(sk); if (sync) { spin_unlock(&po->bind_lock); synchronize_net(); spin_lock(&po->bind_lock); } } static void unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); if (po->running) __unregister_prot_hook(sk, sync); } static inline __pure struct page *pgv_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); return virt_to_page(addr); } static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; h.raw = frame; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_status = status; flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: h.h2->tp_status = status; flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); } smp_wmb(); } static int __packet_get_status(struct packet_sock *po, void *frame) { union tpacket_uhdr h; smp_rmb(); h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); return h.h1->tp_status; case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return h.h2->tp_status; case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); return 0; } } static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps) { if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) && ktime_to_timespec_cond(shhwtstamps->syststamp, ts)) return TP_STATUS_TS_SYS_HARDWARE; if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; } if (ktime_to_timespec_cond(skb->tstamp, ts)) return TP_STATUS_TS_SOFTWARE; return 0; } static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; struct timespec ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) return 0; h.raw = frame; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; break; case TPACKET_V2: h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); } /* one flush is safe, as both fields always lie on the same cacheline */ flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); smp_wmb(); return ts_status; } static void *packet_lookup_frame(struct packet_sock *po, struct packet_ring_buffer *rb, unsigned int position, int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; h.raw = rb->pg_vec[pg_vec_pos].buffer + (frame_offset * rb->frame_size); if (status != __packet_get_status(po, h.raw)) return NULL; return h.raw; } static void *packet_current_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { return packet_lookup_frame(po, rb, rb->head, status); } static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) { del_timer_sync(&pkc->retire_blk_timer); } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, int tx_ring, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc; spin_lock(&rb_queue->lock); pkc->delete_blk_timer = 1; spin_unlock(&rb_queue->lock); prb_del_retire_blk_timer(pkc); } static void prb_init_blk_timer(struct packet_sock *po, struct tpacket_kbdq_core *pkc, void (*func) (unsigned long)) { init_timer(&pkc->retire_blk_timer); pkc->retire_blk_timer.data = (long)po; pkc->retire_blk_timer.function = func; pkc->retire_blk_timer.expires = jiffies; } static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) { struct tpacket_kbdq_core *pkc; if (tx_ring) BUG(); pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc; prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); } static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; struct ethtool_cmd ecmd; int err; u32 speed; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); if (unlikely(!dev)) { rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } err = __ethtool_get_settings(dev, &ecmd); speed = ethtool_cmd_speed(&ecmd); rtnl_unlock(); if (!err) { /* * If the link speed is so slow you don't really * need to worry about perf anyways */ if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) { return DEFAULT_PRB_RETIRE_TOV; } else { msec = 1; div = speed / 1000; } } mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; tmo = mbits * msec; if (div) return tmo+1; return tmo; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, union tpacket_req_u *req_u) { p1->feature_req_word = req_u->req3.tp_feature_req_word; } static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, union tpacket_req_u *req_u, int tx_ring) { struct tpacket_kbdq_core *p1 = &rb->prb_bdqc; struct tpacket_block_desc *pbd; memset(p1, 0x0, sizeof(*p1)); p1->knxt_seq_num = 1; p1->pkbdq = pg_vec; pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; p1->pkblk_start = pg_vec[0].buffer; p1->kblk_size = req_u->req3.tp_block_size; p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; else p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, req_u->req3.tp_block_size); p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po, tx_ring); prb_open_block(p1, pbd); } /* Do NOT update the last_blk_num first. * Assumes sk_buff_head lock is held. */ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) { mod_timer(&pkc->retire_blk_timer, jiffies + pkc->tov_in_jiffies); pkc->last_kactive_blk_num = pkc->kactive_blk_num; } /* * Timer logic: * 1) We refresh the timer only when we open a block. * By doing this we don't waste cycles refreshing the timer * on packet-by-packet basis. * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * * So, if the user sets the 'tmo' to 10ms then the timer * will never fire while the block is still getting filled * (which is what we want). However, the user could choose * to close a block early and that's fine. * * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. * */ static void prb_retire_rx_blk_timer_expired(unsigned long data) { struct packet_sock *po = (struct packet_sock *)data; struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc; unsigned int frozen; struct tpacket_block_desc *pbd; spin_lock(&po->sk.sk_receive_queue.lock); frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); if (unlikely(pkc->delete_blk_timer)) goto out; /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() * copy_bits() is in progress ... * timer fires on other cpu: * we can't retire the current block because copy_bits * is in progress. * */ if (BLOCK_NUM_PKTS(pbd)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... */ cpu_relax(); } } if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { if (!frozen) { prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); if (!prb_dispatch_next_block(pkc, po)) goto refresh_timer; else goto out; } else { /* Case 1. Queue was frozen because user-space was * lagging behind. */ if (prb_curr_blk_in_use(pkc, pbd)) { /* * Ok, user-space is still behind. * So just refresh the timer. */ goto refresh_timer; } else { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this * block and restart the timer. * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect. */ prb_open_block(pkc, pbd); goto out; } } } refresh_timer: _prb_refresh_rx_retire_blk_timer(pkc); out: spin_unlock(&po->sk.sk_receive_queue.lock); } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, __u32 status) { /* Flush everything minus the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 u8 *start, *end; start = (u8 *)pbd1; /* Skip the block header(we know header WILL fit in 4K) */ start += PAGE_SIZE; end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); for (; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif /* Now update the block status. */ BLOCK_STATUS(pbd1) = status; /* Flush the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 start = (u8 *)pbd1; flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif } /* * Side effect: * * 1) flush the block * 2) Increment active_blk_num * * Note:We DONT refresh the timer on purpose. * Because almost always the next block will be opened. */ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, struct packet_sock *po, unsigned int stat) { __u32 status = TP_STATUS_USER | stat; struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; if (po->stats.stats3.tp_drops) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; last_pkt->tp_next_offset = 0; /* Get the ts of the last pkt */ if (BLOCK_NUM_PKTS(pbd1)) { h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { /* Ok, we tmo'd - so get the current time */ struct timespec ts; getnstimeofday(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } smp_wmb(); /* Flush the block */ prb_flush_block(pkc1, pbd1, status); pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); } static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) { pkc->reset_pending_on_curr_blk = 0; } /* * Side effect of opening a block: * * 1) prb_queue is thawed. * 2) retire_blk_timer is refreshed. * */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { struct timespec ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); /* We could have just memset this but we will lose the * flexibility of making the priv area sticky */ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); getnstimeofday(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; pkc1->pkblk_start = (char *)pbd1; pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; pbd1->version = pkc1->version; pkc1->prev = pkc1->nxt_offset; pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } /* * Queue freeze logic: * 1) Assume tp_block_nr = 8 blocks. * 2) At time 't0', user opens Rx ring. * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 * 4) user-space is either sleeping or processing block '0'. * 5) tpacket_rcv is currently filling block '7', since there is no space left, * it will close block-7,loop around and try to fill block '0'. * call-flow: * __packet_lookup_frame_in_block * prb_retire_current_block() * prb_dispatch_next_block() * |->(BLOCK_STATUS == USER) evaluates to true * 5.1) Since block-0 is currently in-use, we just freeze the queue. * 6) Now there are two cases: * 6.1) Link goes idle right after the queue is frozen. * But remember, the last open_block() refreshed the timer. * When this timer expires,it will refresh itself so that we can * re-open block-0 in near future. * 6.2) Link is busy and keeps on receiving packets. This is a simple * case and __packet_lookup_frame_in_block will check if block-0 * is free and can now be re-used. */ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { pkc->reset_pending_on_curr_blk = 1; po->stats.stats3.tp_freeze_q_cnt++; } #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) /* * If the next block is free then we will dispatch it * and return a good offset. * Else, we will freeze the queue. * So, caller must check the return value. */ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { struct tpacket_block_desc *pbd; smp_rmb(); /* 1. Get current block num */ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* 2. If this block is currently in_use then freeze the queue */ if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { prb_freeze_queue(pkc, po); return NULL; } /* * 3. * open this block and return the offset where the first packet * needs to get stored. */ prb_open_block(pkc, pbd); return (void *)pkc->nxt_offset; } static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po, unsigned int status) { struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* retire/close the current block */ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { /* * Plug the case where copy_bits() is in progress on * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't * have space to copy the pkt in the current block and * called prb_retire_current_block() * * We don't need to worry about the TMO case because * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { while (atomic_read(&pkc->blk_fill_in_prog)) { /* Waiting for skb_copy_bits to finish... */ cpu_relax(); } } prb_close_block(pkc, pbd, po, status); return; } } static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd) { return TP_STATUS_USER & BLOCK_STATUS(pbd); } static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) { return pkc->reset_pending_on_curr_blk; } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); atomic_dec(&pkc->blk_fill_in_prog); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb); } static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = 0; } static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { if (vlan_tx_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); ppd->tp_status = TP_STATUS_VLAN_VALID; } else { ppd->hv1.tp_vlan_tci = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) prb_fill_rxhash(pkc, ppd); else prb_clear_rxhash(pkc, ppd); } static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) { struct tpacket3_hdr *ppd; ppd = (struct tpacket3_hdr *)curr; ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); pkc->prev = curr; pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; atomic_inc(&pkc->blk_fill_in_prog); prb_run_all_ft_ops(pkc, ppd); } /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, int status, unsigned int len ) { struct tpacket_kbdq_core *pkc; struct tpacket_block_desc *pbd; char *curr, *end; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* Queue is frozen when user space is lagging behind */ if (prb_queue_frozen(pkc)) { /* * Check if that last block which caused the queue to freeze, * is still in_use by user-space. */ if (prb_curr_blk_in_use(pkc, pbd)) { /* Can't record this packet */ return NULL; } else { /* * Ok, the block was released by user-space. * Now let's open that block. * opening a block also thaws the queue. * Thawing is a side effect. */ prb_open_block(pkc, pbd); } } smp_mb(); curr = pkc->nxt_offset; pkc->skb = skb; end = (char *)pbd + pkc->kblk_size; /* first try the current block */ if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* Ok, close the current block */ prb_retire_current_block(pkc, po, 0); /* Now, try to dispatch the next block */ curr = (char *)prb_dispatch_next_block(pkc, po); if (curr) { pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* * No free blocks are available.user_space hasn't caught up yet. * Queue was just frozen and now this packet will get dropped. */ return NULL; } static void *packet_current_rx_frame(struct packet_sock *po, struct sk_buff *skb, int status, unsigned int len) { char *curr = NULL; switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: curr = packet_lookup_frame(po, &po->rx_ring, po->rx_ring.head, status); return curr; case TPACKET_V3: return __packet_lookup_frame_in_block(po, skb, status, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); return NULL; } } static void *prb_lookup_block(struct packet_sock *po, struct packet_ring_buffer *rb, unsigned int idx, int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); if (status != BLOCK_STATUS(pbd)) return NULL; return pbd; } static int prb_previous_blk_num(struct packet_ring_buffer *rb) { unsigned int prev; if (rb->prb_bdqc.kactive_blk_num) prev = rb->prb_bdqc.kactive_blk_num-1; else prev = rb->prb_bdqc.knum_blocks-1; return prev; } /* Assumes caller has held the rx_queue.lock */ static void *__prb_previous_block(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = prb_previous_blk_num(rb); return prb_lookup_block(po, rb, previous, status); } static void *packet_previous_rx_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { if (po->tp_version <= TPACKET_V2) return packet_previous_frame(po, rb, status); return __prb_previous_block(po, rb, status); } static void packet_increment_rx_head(struct packet_sock *po, struct packet_ring_buffer *rb) { switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: return packet_increment_head(rb); case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); return; } } static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; return packet_lookup_frame(po, rb, previous, status); } static void packet_increment_head(struct packet_ring_buffer *buff) { buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { struct sock *sk = &po->sk; bool has_room; if (po->prot_hook.func != tpacket_rcv) return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) <= sk->sk_rcvbuf; spin_lock(&sk->sk_receive_queue.lock); if (po->tp_version == TPACKET_V3) has_room = prb_lookup_block(po, &po->rx_ring, po->rx_ring.prb_bdqc.kactive_blk_num, TP_STATUS_KERNEL); else has_room = packet_lookup_frame(po, &po->rx_ring, po->rx_ring.head, TP_STATUS_KERNEL); spin_unlock(&sk->sk_receive_queue.lock); return has_room; } static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(atomic_read(&sk->sk_wmem_alloc)); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive packet socket: %p\n", sk); return; } sk_refcnt_debug_dec(sk); } static int fanout_rr_next(struct packet_fanout *f, unsigned int num) { int x = atomic_read(&f->rr_cur) + 1; if (x >= num) x = 0; return x; } static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return reciprocal_divide(skb->rxhash, num); } static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { int cur, old; cur = atomic_read(&f->rr_cur); while ((old = atomic_cmpxchg(&f->rr_cur, cur, fanout_rr_next(f, num))) != cur) cur = old; return cur; } static unsigned int fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return smp_processor_id() % num; } static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return reciprocal_divide(prandom_u32(), num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, unsigned int skip, unsigned int num) { unsigned int i, j; i = j = min_t(int, f->next[idx], num - 1); do { if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { if (i != j) f->next[idx] = i; return i; } if (++i == num) i = 0; } while (i != j); return idx; } static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); } static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = f->num_members; struct packet_sock *po; unsigned int idx; if (!net_eq(dev_net(dev), read_pnet(&f->net)) || !num) { kfree_skb(skb); return 0; } switch (f->type) { case PACKET_FANOUT_HASH: default: if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } skb_get_rxhash(skb); idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: idx = fanout_demux_lb(f, skb, num); break; case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; case PACKET_FANOUT_RND: idx = fanout_demux_rnd(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); break; } po = pkt_sk(f->arr[idx]); if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && unlikely(!packet_rcv_has_room(po, skb))) { idx = fanout_demux_rollover(f, skb, idx, idx, num); po = pkt_sk(f->arr[idx]); } return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); static void __fanout_link(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; spin_lock(&f->lock); f->arr[f->num_members] = sk; smp_wmb(); f->num_members++; spin_unlock(&f->lock); } static void __fanout_unlink(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; int i; spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { if (f->arr[i] == sk) break; } BUG_ON(i >= f->num_members); f->arr[i] = f->arr[f->num_members - 1]; f->num_members--; spin_unlock(&f->lock); } static bool match_fanout_group(struct packet_type *ptype, struct sock * sk) { if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) return true; return false; } static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; int err; switch (type) { case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: break; default: return -EINVAL; } if (!po->running) return -EINVAL; if (po->fanout) return -EALREADY; mutex_lock(&fanout_mutex); match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && read_pnet(&f->net) == sock_net(sk)) { match = f; break; } } err = -EINVAL; if (match && match->flags != flags) goto out; if (!match) { err = -ENOMEM; match = kzalloc(sizeof(*match), GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); match->id = id; match->type = type; match->flags = flags; atomic_set(&match->rr_cur, 0); INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); atomic_set(&match->sk_ref, 0); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.id_match = match_fanout_group; dev_add_pack(&match->prot_hook); list_add(&match->list, &fanout_list); } err = -EINVAL; if (match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) { __dev_remove_pack(&po->prot_hook); po->fanout = match; atomic_inc(&match->sk_ref); __fanout_link(sk, po); err = 0; } } out: mutex_unlock(&fanout_mutex); return err; } static void fanout_release(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; f = po->fanout; if (!f) return; mutex_lock(&fanout_mutex); po->fanout = NULL; if (atomic_dec_and_test(&f->sk_ref)) { list_del(&f->list); dev_remove_pack(&f->prot_hook); kfree(f); } mutex_unlock(&fanout_mutex); } static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; /* * When we registered the protocol we saved the socket in the data * field for just this event. */ sk = pt->af_packet_priv; /* * Yank back the headers [hope the device set this * right or kerboom...] * * Incoming packets have ll header pulled, * push it back. * * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop. */ if (skb->pkt_type == PACKET_LOOPBACK) goto out; if (!net_eq(dev_net(dev), sock_net(sk))) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) goto oom; /* drop any routing info */ skb_dst_drop(skb); /* drop conntrack reference */ nf_reset(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; skb_push(skb, skb->data - skb_mac_header(skb)); /* * The SOCK_PACKET socket receives _all_ frames. */ spkt->spkt_family = dev->type; strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); spkt->spkt_protocol = skb->protocol; /* * Charge the memory to the socket. This is done specifically * to prevent sockets using all the memory up. */ if (sock_queue_rcv_skb(sk, skb) == 0) return 0; out: kfree_skb(skb); oom: return 0; } /* * Output a raw packet to a device layer. This bypasses all the other * protocol layers and you must therefore supply it with a complete frame */ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name; struct sk_buff *skb = NULL; struct net_device *dev; __be16 proto = 0; int err; int extra_len = 0; /* * Get and verify the address. */ if (saddr) { if (msg->msg_namelen < sizeof(struct sockaddr)) return -EINVAL; if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) proto = saddr->spkt_protocol; } else return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ /* * Find the device first to size check it */ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; retry: rcu_read_lock(); dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; err = -ENETDOWN; if (!(dev->flags & IFF_UP)) goto out_unlock; /* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level. */ if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) goto out_unlock; if (!skb) { size_t reserved = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; rcu_read_unlock(); skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard * header at transmission time by themselves. PPP is the notable * one here. This should really be fixed at the driver level. */ skb_reserve(skb, reserved); skb_reset_network_header(skb); /* Try to align data part correctly */ if (hhlen) { skb->data -= hhlen; skb->tail -= hhlen; if (len < hhlen) skb_reset_network_header(skb); } err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); if (err) goto out_free; goto retry; } if (len > (dev->mtu + dev->hard_header_len + extra_len)) { /* Earlier code assumed this would be a VLAN pkt, * double-check this now that we have the actual * packet in hand. */ struct ethhdr *ehdr; skb_reset_mac_header(skb); ehdr = eth_hdr(skb); if (ehdr->h_proto != htons(ETH_P_8021Q)) { err = -EMSGSIZE; goto out_unlock; } } skb->protocol = proto; skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (unlikely(extra_len == 4)) skb->no_fcs = 1; skb_probe_transport_header(skb, 0); dev_queue_xmit(skb); rcu_read_unlock(); return len; out_unlock: rcu_read_unlock(); out_free: kfree_skb(skb); return err; } static unsigned int run_filter(const struct sk_buff *skb, const struct sock *sk, unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = SK_RUN_FILTER(filter, skb); rcu_read_unlock(); return res; } /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. * * Note tricky part: we DO mangle shared skb! skb->data, skb->len * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh * sequencially, so that if we return skb to original state on exit, * we will not harm anyone. */ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; skb->dev = dev; if (dev->header_ops) { /* The device has an explicit notion of ll header, * exported to higher levels. * * Otherwise, the device hides details of its frame * structure, so that corresponding packet head is * never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb->len; res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; if (snaplen > res) snaplen = res; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto drop_n_acct; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb == NULL) goto drop_n_acct; if (skb_head != skb->data) { skb->data = skb_head; skb->len = skb_len; } consume_skb(skb); skb = nskb; } BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 > sizeof(skb->cb)); sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(po->origdev)) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; sll->sll_halen = dev_parse_header(skb, sll->sll_addr); PACKET_SKB_CB(skb)->origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; skb_set_owner_r(skb, sk); skb->dev = NULL; skb_dst_drop(skb); /* drop conntrack reference */ nf_reset(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; skb->dropcount = atomic_read(&sk->sk_drops); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk, skb->len); return 0; drop_n_acct: spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_drops++; atomic_inc(&sk->sk_drops); spin_unlock(&sk->sk_receive_queue.lock); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: consume_skb(skb); return 0; } static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct packet_sock *po; struct sockaddr_ll *sll; union tpacket_uhdr h; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; unsigned long status = TP_STATUS_USER; unsigned short macoff, netoff, hdrlen; struct sk_buff *copy_skb = NULL; struct timespec ts; __u32 ts_status; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; if (dev->header_ops) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; snaplen = skb->len; res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; if (snaplen > res) snaplen = res; if (sk->sk_type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + po->tp_reserve; } else { unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; macoff = netoff - maclen; } if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { if (po->copy_thresh && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); } else { copy_skb = skb_get(skb); skb_head = skb->data; } if (copy_skb) skb_set_owner_r(copy_skb, sk); } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) snaplen = 0; } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto ring_is_full; if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* * LOSING will be reported till you read the stats, * because it's COR - Clear On Read. * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ if (po->stats.stats1.tp_drops) status |= TP_STATUS_LOSING; } po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) getnstimeofday(&ts); status |= ts_status; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: h.h2->tp_len = skb->len; h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; if (vlan_tx_tag_present(skb)) { h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); status |= TP_STATUS_VLAN_VALID; } else { h.h2->tp_vlan_tci = 0; } h.h2->tp_padding = 0; hdrlen = sizeof(*h.h2); break; case TPACKET_V3: /* tp_nxt_offset,vlan are already populated above. * So DONT clear those fields here */ h.h3->tp_status |= status; h.h3->tp_len = skb->len; h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; hdrlen = sizeof(*h.h3); break; default: BUG(); } sll = h.raw + TPACKET_ALIGN(hdrlen); sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; sll->sll_protocol = skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(po->origdev)) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; smp_mb(); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 { u8 *start, *end; if (po->tp_version <= TPACKET_V2) { end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen); for (start = h.raw; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); } smp_wmb(); } #endif if (po->tp_version <= TPACKET_V2) __packet_set_status(po, h.raw, status); else prb_clear_blk_fill_status(&po->rx_ring); sk->sk_data_ready(sk, 0); drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: kfree_skb(skb); return 0; ring_is_full: po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk, 0); kfree_skb(copy_skb); goto drop_n_restore; } static void tpacket_destruct_skb(struct sk_buff *skb) { struct packet_sock *po = pkt_sk(skb->sk); void *ph; if (likely(po->tx_ring.pg_vec)) { __u32 ts; ph = skb_shinfo(skb)->destructor_arg; BUG_ON(atomic_read(&po->tx_ring.pending) == 0); atomic_dec(&po->tx_ring.pending); ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); } sock_wfree(skb); } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, int size_max, __be16 proto, unsigned char *addr, int hlen) { union tpacket_uhdr ph; int to_write, offset, len, tp_len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; void *data; int err; ph.raw = frame; skb->protocol = proto; skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; switch (po->tp_version) { case TPACKET_V2: tp_len = ph.h2->tp_len; break; default: tp_len = ph.h1->tp_len; break; } if (unlikely(tp_len > size_max)) { pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); return -EMSGSIZE; } skb_reserve(skb, hlen); skb_reset_network_header(skb); skb_probe_transport_header(skb, 0); if (po->tp_tx_has_off) { int off_min, off_max, off; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); off_max = po->tx_ring.frame_size - tp_len; if (sock->type == SOCK_DGRAM) { switch (po->tp_version) { case TPACKET_V2: off = ph.h2->tp_net; break; default: off = ph.h1->tp_net; break; } } else { switch (po->tp_version) { case TPACKET_V2: off = ph.h2->tp_mac; break; default: off = ph.h1->tp_mac; break; } } if (unlikely((off < off_min) || (off_max < off))) return -EINVAL; data = ph.raw + off; } else { data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); } to_write = tp_len; if (sock->type == SOCK_DGRAM) { err = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; } else if (dev->hard_header_len) { /* net device doesn't like empty head */ if (unlikely(tp_len <= dev->hard_header_len)) { pr_err("packet size is too short (%d < %d)\n", tp_len, dev->hard_header_len); return -EINVAL; } skb_push(skb, dev->hard_header_len); err = skb_store_bits(skb, 0, data, dev->hard_header_len); if (unlikely(err)) return err; data += dev->hard_header_len; to_write -= dev->hard_header_len; } offset = offset_in_page(data); len_max = PAGE_SIZE - offset; len = ((to_write > len_max) ? len_max : to_write); skb->data_len = to_write; skb->len += to_write; skb->truesize += to_write; atomic_add(to_write, &po->sk.sk_wmem_alloc); while (likely(to_write)) { nr_frags = skb_shinfo(skb)->nr_frags; if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { pr_err("Packet exceed the number of skb frags(%lu)\n", MAX_SKB_FRAGS); return -EFAULT; } page = pgv_to_page(data); data += len; flush_dcache_page(page); get_page(page); skb_fill_page_desc(skb, nr_frags, page, offset, len); to_write -= len; offset = 0; len_max = PAGE_SIZE; len = ((to_write > len_max) ? len_max : to_write); } return tp_len; } static struct net_device *packet_cached_dev_get(struct packet_sock *po) { struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(po->cached_dev); if (dev) dev_hold(dev); rcu_read_unlock(); return dev; } static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb; struct net_device *dev; __be16 proto; int err, reserve = 0; void *ph; struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; int tp_len, size_max; unsigned char *addr; int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen; mutex_lock(&po->pg_vec_lock); if (saddr == NULL) { dev = packet_cached_dev_get(po); proto = po->num; addr = NULL; } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; addr = saddr->sll_addr; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); } err = -ENXIO; if (unlikely(dev == NULL)) goto out; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_put; reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); if (size_max > dev->mtu + reserve) size_max = dev->mtu + reserve; do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { schedule(); continue; } status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll), 0, &err); if (unlikely(skb == NULL)) goto out_status; tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, addr, hlen); if (unlikely(tp_len < 0)) { if (po->tp_loss) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); packet_increment_head(&po->tx_ring); kfree_skb(skb); continue; } else { status = TP_STATUS_WRONG_FORMAT; err = tp_len; goto out_status; } } skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); atomic_inc(&po->tx_ring.pending); status = TP_STATUS_SEND_REQUEST; err = dev_queue_xmit(skb); if (unlikely(err > 0)) { err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ skb = NULL; goto out_status; } /* * skb was dropped but not destructed yet; * let's treat it like congestion or err < 0 */ err = 0; } packet_increment_head(&po->tx_ring); len_sum += tp_len; } while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT)) && (atomic_read(&po->tx_ring.pending)))) ); err = len_sum; goto out_put; out_status: __packet_set_status(po, ph, status); kfree_skb(skb); out_put: dev_put(dev); out: mutex_unlock(&po->pg_vec_lock); return err; } static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, size_t reserve, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, 0); if (!skb) return NULL; skb_reserve(skb, reserve); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; struct sk_buff *skb; struct net_device *dev; __be16 proto; unsigned char *addr; int err, reserve = 0; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; int vnet_hdr_len; struct packet_sock *po = pkt_sk(sk); unsigned short gso_type = 0; int hlen, tlen; int extra_len = 0; /* * Get and verify the address. */ if (saddr == NULL) { dev = packet_cached_dev_get(po); proto = po->num; addr = NULL; } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; addr = saddr->sll_addr; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); } err = -ENXIO; if (unlikely(dev == NULL)) goto out_unlock; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (po->has_vnet_hdr) { vnet_hdr_len = sizeof(vnet_hdr); err = -EINVAL; if (len < vnet_hdr_len) goto out_unlock; len -= vnet_hdr_len; err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov, vnet_hdr_len); if (err < 0) goto out_unlock; if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 > vnet_hdr.hdr_len)) vnet_hdr.hdr_len = vnet_hdr.csum_start + vnet_hdr.csum_offset + 2; err = -EINVAL; if (vnet_hdr.hdr_len > len) goto out_unlock; if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: gso_type = SKB_GSO_TCPV4; break; case VIRTIO_NET_HDR_GSO_TCPV6: gso_type = SKB_GSO_TCPV6; break; case VIRTIO_NET_HDR_GSO_UDP: gso_type = SKB_GSO_UDP; break; default: goto out_unlock; } if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) gso_type |= SKB_GSO_TCP_ECN; if (vnet_hdr.gso_size == 0) goto out_unlock; } } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; skb_set_network_header(skb, reserve); err = -EINVAL; if (sock->type == SOCK_DGRAM && (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0) goto out_free; /* Returns -EFAULT on error */ err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); if (err) goto out_free; sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (!gso_type && (len > dev->mtu + reserve + extra_len)) { /* Earlier code assumed this would be a VLAN pkt, * double-check this now that we have the actual * packet in hand. */ struct ethhdr *ehdr; skb_reset_mac_header(skb); ehdr = eth_hdr(skb); if (ehdr->h_proto != htons(ETH_P_8021Q)) { err = -EMSGSIZE; goto out_free; } } skb->protocol = proto; skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { if (!skb_partial_csum_set(skb, vnet_hdr.csum_start, vnet_hdr.csum_offset)) { err = -EINVAL; goto out_free; } } skb_shinfo(skb)->gso_size = vnet_hdr.gso_size; skb_shinfo(skb)->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; skb_shinfo(skb)->gso_segs = 0; len += vnet_hdr_len; } skb_probe_transport_header(skb, reserve); if (unlikely(extra_len == 4)) skb->no_fcs = 1; /* * Now send it */ err = dev_queue_xmit(skb); if (err > 0 && (err = net_xmit_errno(err)) != 0) goto out_unlock; dev_put(dev); return len; out_free: kfree_skb(skb); out_unlock: if (dev) dev_put(dev); out: return err; } static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); if (po->tx_ring.pg_vec) return tpacket_snd(po, msg); else return packet_snd(sock, msg, len); } /* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. */ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; struct net *net; union tpacket_req_u req_u; if (!sk) return 0; net = sock_net(sk); po = pkt_sk(sk); mutex_lock(&net->packet.sklist_lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->packet.sklist_lock); preempt_disable(); sock_prot_inuse_add(net, sk->sk_prot, -1); preempt_enable(); spin_lock(&po->bind_lock); unregister_prot_hook(sk, false); if (po->prot_hook.dev) { dev_put(po->prot_hook.dev); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); packet_flush_mclist(sk); if (po->rx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 0); } if (po->tx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 1); } fanout_release(sk); synchronize_net(); /* * Now the socket is dead. No more input will appear. */ sock_orphan(sk); sock->sk = NULL; /* Purge queues */ skb_queue_purge(&sk->sk_receive_queue); sk_refcnt_debug_release(sk); sock_put(sk); return 0; } /* * Attach a packet hook. */ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) { struct packet_sock *po = pkt_sk(sk); if (po->fanout) { if (dev) dev_put(dev); return -EINVAL; } lock_sock(sk); spin_lock(&po->bind_lock); unregister_prot_hook(sk, true); po->num = protocol; po->prot_hook.type = protocol; if (po->prot_hook.dev) dev_put(po->prot_hook.dev); po->prot_hook.dev = dev; po->ifindex = dev ? dev->ifindex : 0; if (protocol == 0) goto out_unlock; if (!dev || (dev->flags & IFF_UP)) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); } out_unlock: spin_unlock(&po->bind_lock); release_sock(sk); return 0; } /* * Bind a packet socket to a device */ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; char name[15]; struct net_device *dev; int err = -ENODEV; /* * Check legality */ if (addr_len != sizeof(struct sockaddr)) return -EINVAL; strlcpy(name, uaddr->sa_data, sizeof(name)); dev = dev_get_by_name(sock_net(sk), name); if (dev) err = packet_do_bind(sk, dev, pkt_sk(sk)->num); return err; } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; struct net_device *dev = NULL; int err; /* * Check legality */ if (addr_len < sizeof(struct sockaddr_ll)) return -EINVAL; if (sll->sll_family != AF_PACKET) return -EINVAL; if (sll->sll_ifindex) { err = -ENODEV; dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex); if (dev == NULL) goto out; } err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); out: return err; } static struct proto packet_proto = { .name = "PACKET", .owner = THIS_MODULE, .obj_size = sizeof(struct packet_sock), }; /* * Create a packet of type SOCK_PACKET. */ static int packet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct packet_sock *po; __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; err = -ENOBUFS; sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); if (sk == NULL) goto out; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; sock_init_data(sock, sk); po = pkt_sk(sk); sk->sk_family = PF_PACKET; po->num = proto; RCU_INIT_POINTER(po->cached_dev, NULL); sk->sk_destruct = packet_sock_destruct; sk_refcnt_debug_inc(sk); /* * Attach a protocol block */ spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; if (proto) { po->prot_hook.type = proto; register_prot_hook(sk); } mutex_lock(&net->packet.sklist_lock); sk_add_node_rcu(sk, &net->packet.sklist); mutex_unlock(&net->packet.sklist_lock); preempt_disable(); sock_prot_inuse_add(net, &packet_proto, 1); preempt_enable(); return 0; out: return err; } /* * Pull a packet from our receive queue and hand it to the user. * If necessary we block. */ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) goto out; #if 0 /* What error should we return now? EUNATTACH? */ if (pkt_sk(sk)->ifindex < 0) return -ENODEV; #endif if (flags & MSG_ERRQUEUE) { err = sock_recv_errqueue(sk, msg, len, SOL_PACKET, PACKET_TX_TIMESTAMP); goto out; } /* * Call the generic datagram receiver. This handles all sorts * of horrible races and re-entrancy so we can forget about it * in the protocol layers. * * Now it will return ENETDOWN, if device have just gone down, * but then it will block. */ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); /* * An error occurred so return it. Because skb_recv_datagram() * handles the blocking we don't see and worry about blocking * retries. */ if (skb == NULL) goto out; if (pkt_sk(sk)->has_vnet_hdr) { struct virtio_net_hdr vnet_hdr = { 0 }; err = -EINVAL; vnet_hdr_len = sizeof(vnet_hdr); if (len < vnet_hdr_len) goto out_free; len -= vnet_hdr_len; if (skb_is_gso(skb)) { struct skb_shared_info *sinfo = skb_shinfo(skb); /* This is a hint as to how much should be linear. */ vnet_hdr.hdr_len = skb_headlen(skb); vnet_hdr.gso_size = sinfo->gso_size; if (sinfo->gso_type & SKB_GSO_TCPV4) vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; else if (sinfo->gso_type & SKB_GSO_UDP) vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; else if (sinfo->gso_type & SKB_GSO_FCOE) goto out_free; else BUG(); if (sinfo->gso_type & SKB_GSO_TCP_ECN) vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; if (skb->ip_summed == CHECKSUM_PARTIAL) { vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; vnet_hdr.csum_start = skb_checksum_start_offset(skb); vnet_hdr.csum_offset = skb->csum_offset; } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr, vnet_hdr_len); if (err < 0) goto out_free; } /* You lose any data beyond the buffer you gave. If it worries * a user program they can ask the device for its MTU * anyway. */ copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (err) goto out_free; sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { msg->msg_namelen = sizeof(struct sockaddr_pkt); } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, msg->msg_namelen); } if (pkt_sk(sk)->auxdata) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; aux.tp_len = PACKET_SKB_CB(skb)->origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); if (vlan_tx_tag_present(skb)) { aux.tp_vlan_tci = vlan_tx_tag_get(skb); aux.tp_status |= TP_STATUS_VLAN_VALID; } else { aux.tp_vlan_tci = 0; } aux.tp_padding = 0; put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } /* * Free or return the buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); out_free: skb_free_datagram(sk, skb); out: return err; } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) { struct net_device *dev; struct sock *sk = sock->sk; if (peer) return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); if (dev) strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); *uaddr_len = sizeof(*uaddr); return 0; } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) { struct net_device *dev; struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); if (peer) return -EOPNOTSUPP; sll->sll_family = AF_PACKET; sll->sll_ifindex = po->ifindex; sll->sll_protocol = po->num; sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; } rcu_read_unlock(); *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; return 0; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) { switch (i->type) { case PACKET_MR_MULTICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_mc_add(dev, i->addr); else return dev_mc_del(dev, i->addr); break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); break; case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); break; case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_uc_add(dev, i->addr); else return dev_uc_del(dev, i->addr); break; default: break; } return 0; } static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) { for ( ; i; i = i->next) { if (i->ifindex == dev->ifindex) packet_dev_mc(dev, i, what); } } static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml, *i; struct net_device *dev; int err; rtnl_lock(); err = -ENODEV; dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); if (!dev) goto done; err = -EINVAL; if (mreq->mr_alen > dev->addr_len) goto done; err = -ENOBUFS; i = kmalloc(sizeof(*i), GFP_KERNEL); if (i == NULL) goto done; err = 0; for (ml = po->mclist; ml; ml = ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { ml->count++; /* Free the new element ... */ kfree(i); goto done; } } i->type = mreq->mr_type; i->ifindex = mreq->mr_ifindex; i->alen = mreq->mr_alen; memcpy(i->addr, mreq->mr_address, i->alen); i->count = 1; i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); if (err) { po->mclist = i->next; kfree(i); } done: rtnl_unlock(); return err; } static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_mclist *ml, **mlp; rtnl_lock(); for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { if (--ml->count == 0) { struct net_device *dev; *mlp = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev) packet_dev_mc(dev, ml, -1); kfree(ml); } rtnl_unlock(); return 0; } } rtnl_unlock(); return -EADDRNOTAVAIL; } static void packet_flush_mclist(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml; if (!po->mclist) return; rtnl_lock(); while ((ml = po->mclist) != NULL) { struct net_device *dev; po->mclist = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev != NULL) packet_dev_mc(dev, ml, -1); kfree(ml); } rtnl_unlock(); } static int packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); int ret; if (level != SOL_PACKET) return -ENOPROTOOPT; switch (optname) { case PACKET_ADD_MEMBERSHIP: case PACKET_DROP_MEMBERSHIP: { struct packet_mreq_max mreq; int len = optlen; memset(&mreq, 0, sizeof(mreq)); if (len < sizeof(struct packet_mreq)) return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); if (copy_from_user(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; if (optname == PACKET_ADD_MEMBERSHIP) ret = packet_mc_add(sk, &mreq); else ret = packet_mc_drop(sk, &mreq); return ret; } case PACKET_RX_RING: case PACKET_TX_RING: { union tpacket_req_u req_u; int len; switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: len = sizeof(req_u.req); break; case TPACKET_V3: default: len = sizeof(req_u.req3); break; } if (optlen < len) return -EINVAL; if (pkt_sk(sk)->has_vnet_hdr) return -EINVAL; if (copy_from_user(&req_u.req, optval, len)) return -EFAULT; return packet_set_ring(sk, &req_u, 0, optname == PACKET_TX_RING); } case PACKET_COPY_THRESH: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; pkt_sk(sk)->copy_thresh = val; return 0; } case PACKET_VERSION: { int val; if (optlen != sizeof(val)) return -EINVAL; if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: po->tp_version = val; return 0; default: return -EINVAL; } } case PACKET_RESERVE: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; po->tp_reserve = val; return 0; } case PACKET_LOSS: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; po->tp_loss = !!val; return 0; } case PACKET_AUXDATA: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; po->auxdata = !!val; return 0; } case PACKET_ORIGDEV: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; po->origdev = !!val; return 0; } case PACKET_VNET_HDR: { int val; if (sock->type != SOCK_RAW) return -EINVAL; if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) return -EBUSY; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; po->has_vnet_hdr = !!val; return 0; } case PACKET_TIMESTAMP: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; po->tp_tstamp = val; return 0; } case PACKET_FANOUT: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; return fanout_add(sk, val & 0xffff, val >> 16); } case PACKET_TX_HAS_OFF: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; po->tp_tx_has_off = !!val; return 0; } default: return -ENOPROTOOPT; } } static int packet_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { int len; int val, lv = sizeof(val); struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; if (level != SOL_PACKET) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case PACKET_STATISTICS: spin_lock_bh(&sk->sk_receive_queue.lock); memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); st.stats3.tp_packets += st.stats3.tp_drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); st.stats1.tp_packets += st.stats1.tp_drops; data = &st.stats1; } break; case PACKET_AUXDATA: val = po->auxdata; break; case PACKET_ORIGDEV: val = po->origdev; break; case PACKET_VNET_HDR: val = po->has_vnet_hdr; break; case PACKET_VERSION: val = po->tp_version; break; case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { case TPACKET_V1: val = sizeof(struct tpacket_hdr); break; case TPACKET_V2: val = sizeof(struct tpacket2_hdr); break; case TPACKET_V3: val = sizeof(struct tpacket3_hdr); break; default: return -EINVAL; } break; case PACKET_RESERVE: val = po->tp_reserve; break; case PACKET_LOSS: val = po->tp_loss; break; case PACKET_TIMESTAMP: val = po->tp_tstamp; break; case PACKET_FANOUT: val = (po->fanout ? ((u32)po->fanout->id | ((u32)po->fanout->type << 16) | ((u32)po->fanout->flags << 24)) : 0); break; case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; break; default: return -ENOPROTOOPT; } if (len > lv) len = lv; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, data, len)) return -EFAULT; return 0; } static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) packet_dev_mclist(dev, po->mclist, -1); /* fallthrough */ case NETDEV_DOWN: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->running) { __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); } if (msg == NETDEV_UNREGISTER) { po->ifindex = -1; if (po->prot_hook.dev) dev_put(po->prot_hook.dev); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); } break; case NETDEV_UP: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->num) register_prot_hook(sk); spin_unlock(&po->bind_lock); } break; } } rcu_read_unlock(); return NOTIFY_DONE; } static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCOUTQ: { int amount = sk_wmem_alloc_get(sk); return put_user(amount, (int __user *)arg); } case SIOCINQ: { struct sk_buff *skb; int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } case SIOCGSTAMP: return sock_get_timestamp(sk, (struct timeval __user *)arg); case SIOCGSTAMPNS: return sock_get_timestampns(sk, (struct timespec __user *)arg); #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: case SIOCDARP: case SIOCGARP: case SIOCSARP: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: return inet_dgram_ops.ioctl(sock, cmd, arg); #endif default: return -ENOIOCTLCMD; } return 0; } static unsigned int packet_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); unsigned int mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { if (!packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) mask |= POLLIN | POLLRDNORM; } spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) mask |= POLLOUT | POLLWRNORM; } spin_unlock_bh(&sk->sk_write_queue.lock); return mask; } /* Dirty? Well, I still did not learn better way to account * for user mmaps. */ static void packet_mm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_inc(&pkt_sk(sk)->mapped); } static void packet_mm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_dec(&pkt_sk(sk)->mapped); } static const struct vm_operations_struct packet_mmap_ops = { .open = packet_mm_open, .close = packet_mm_close, }; static void free_pg_vec(struct pgv *pg_vec, unsigned int order, unsigned int len) { int i; for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { if (is_vmalloc_addr(pg_vec[i].buffer)) vfree(pg_vec[i].buffer); else free_pages((unsigned long)pg_vec[i].buffer, order); pg_vec[i].buffer = NULL; } } kfree(pg_vec); } static char *alloc_one_pg_vec_page(unsigned long order) { char *buffer = NULL; gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* * __get_free_pages failed, fall back to vmalloc */ buffer = vzalloc((1 << order) * PAGE_SIZE); if (buffer) return buffer; /* * vmalloc failed, lets dig into swap here */ gfp_flags &= ~__GFP_NORETRY; buffer = (char *)__get_free_pages(gfp_flags, order); if (buffer) return buffer; /* * complete and utter failure */ return NULL; } static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) { unsigned int block_nr = req->tp_block_nr; struct pgv *pg_vec; int i; pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL); if (unlikely(!pg_vec)) goto out; for (i = 0; i < block_nr; i++) { pg_vec[i].buffer = alloc_one_pg_vec_page(order); if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } out: return pg_vec; out_free_pgvec: free_pg_vec(pg_vec, order, block_nr); pg_vec = NULL; goto out; } static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring) { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; int err = -EINVAL; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { WARN(1, "Tx-ring is not supported.\n"); goto out; } rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; err = -EBUSY; if (!closing) { if (atomic_read(&po->mapped)) goto out; if (atomic_read(&rb->pending)) goto out; } if (req->tp_block_nr) { /* Sanity tests and some calculations */ err = -EBUSY; if (unlikely(rb->pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V1: po->tp_hdrlen = TPACKET_HDRLEN; break; case TPACKET_V2: po->tp_hdrlen = TPACKET2_HDRLEN; break; case TPACKET_V3: po->tp_hdrlen = TPACKET3_HDRLEN; break; } err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) goto out; if (unlikely(req->tp_frame_size < po->tp_hdrlen + po->tp_reserve)) goto out; if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; rb->frames_per_block = req->tp_block_size/req->tp_frame_size; if (unlikely(rb->frames_per_block <= 0)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) goto out; err = -ENOMEM; order = get_order(req->tp_block_size); pg_vec = alloc_pg_vec(req, order); if (unlikely(!pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V3: /* Transmit path is not supported. We checked * it above but just being paranoid */ if (!tx_ring) init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); break; default: break; } } /* Done */ else { err = -EINVAL; if (unlikely(req->tp_frame_nr)) goto out; } lock_sock(sk); /* Detach socket from network */ spin_lock(&po->bind_lock); was_running = po->running; num = po->num; if (was_running) { po->num = 0; __unregister_prot_hook(sk, false); } spin_unlock(&po->bind_lock); synchronize_net(); err = -EBUSY; mutex_lock(&po->pg_vec_lock); if (closing || atomic_read(&po->mapped) == 0) { err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); swap(rb->pg_vec_order, order); swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; po->prot_hook.func = (po->rx_ring.pg_vec) ? tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); if (atomic_read(&po->mapped)) pr_err("packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped)); } mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); if (was_running) { po->num = num; register_prot_hook(sk); } spin_unlock(&po->bind_lock); if (closing && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue); } release_sock(sk); if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: return err; } static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); unsigned long size, expected_size; struct packet_ring_buffer *rb; unsigned long start; int err = -EINVAL; int i; if (vma->vm_pgoff) return -EINVAL; mutex_lock(&po->pg_vec_lock); expected_size = 0; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec) { expected_size += rb->pg_vec_len * rb->pg_vec_pages * PAGE_SIZE; } } if (expected_size == 0) goto out; size = vma->vm_end - vma->vm_start; if (size != expected_size) goto out; start = vma->vm_start; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec == NULL) continue; for (i = 0; i < rb->pg_vec_len; i++) { struct page *page; void *kaddr = rb->pg_vec[i].buffer; int pg_num; for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { page = pgv_to_page(kaddr); err = vm_insert_page(vma, start, page); if (unlikely(err)) goto out; start += PAGE_SIZE; kaddr += PAGE_SIZE; } } } atomic_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; out: mutex_unlock(&po->pg_vec_lock); return err; } static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind_spkt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, .sendpage = sock_no_sendpage, }; static const struct net_proto_family packet_family_ops = { .family = PF_PACKET, .create = packet_create, .owner = THIS_MODULE, }; static struct notifier_block packet_netdev_notifier = { .notifier_call = packet_notifier, }; #ifdef CONFIG_PROC_FS static void *packet_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); rcu_read_lock(); return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); } static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); return seq_hlist_next_rcu(v, &net->packet.sklist, pos); } static void packet_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); seq_printf(seq, "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", s, atomic_read(&s->sk_refcnt), s->sk_type, ntohs(po->num), po->ifindex, po->running, atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), sock_i_ino(s)); } return 0; } static const struct seq_operations packet_seq_ops = { .start = packet_seq_start, .next = packet_seq_next, .stop = packet_seq_stop, .show = packet_seq_show, }; static int packet_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &packet_seq_ops, sizeof(struct seq_net_private)); } static const struct file_operations packet_seq_fops = { .owner = THIS_MODULE, .open = packet_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, }; #endif static int __net_init packet_net_init(struct net *net) { mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops)) return -ENOMEM; return 0; } static void __net_exit packet_net_exit(struct net *net) { remove_proc_entry("packet", net->proc_net); } static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, }; static void __exit packet_exit(void) { unregister_netdevice_notifier(&packet_netdev_notifier); unregister_pernet_subsys(&packet_net_ops); sock_unregister(PF_PACKET); proto_unregister(&packet_proto); } static int __init packet_init(void) { int rc = proto_register(&packet_proto, 0); if (rc != 0) goto out; sock_register(&packet_family_ops); register_pernet_subsys(&packet_net_ops); register_netdevice_notifier(&packet_netdev_notifier); out: return rc; } module_init(packet_init); module_exit(packet_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PACKET);