aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux/tcp.h
blob: 4ad0706d40ebecf98e1ca95483a79f9dd44e903c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the TCP protocol.
 *
 * Version:	@(#)tcp.h	1.0.2	04/28/93
 *
 * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#ifndef _LINUX_TCP_H
#define _LINUX_TCP_H


#include <linux/skbuff.h>
#include <linux/dmaengine.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <uapi/linux/tcp.h>

/* Return the transport header of @skb viewed as a TCP header. */
static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb_transport_header(skb);

	return th;
}

/*
 * Return the length of the TCP header of @skb: the header's doff field
 * (a count of 32-bit words in the TCP header format) multiplied by 4.
 */
static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
{
	unsigned int doff_words = tcp_hdr(skb)->doff;

	return doff_words * 4;
}

/* Return the inner transport header of @skb viewed as a TCP header. */
static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb)
{
	struct tcphdr *inner_th;

	inner_th = (struct tcphdr *)skb_inner_transport_header(skb);
	return inner_th;
}

/*
 * Same computation as tcp_hdrlen(), but on the inner TCP header of @skb:
 * the doff field of inner_tcp_hdr(skb) multiplied by 4.
 */
static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb)
{
	unsigned int doff_words = inner_tcp_hdr(skb)->doff;

	return doff_words * 4;
}

/*
 * Return the size of the TCP options of @skb: (doff - 5) * 4, i.e. the
 * header length beyond the first five 32-bit words.
 *
 * No range check is done here: if doff is below 5 the subtraction wraps
 * and a very large unsigned value is returned.
 */
static inline unsigned int tcp_optlen(const struct sk_buff *skb)
{
	unsigned int doff_words = tcp_hdr(skb)->doff;

	return (doff_words - 5) * 4;
}

/* TCP Fast Open: cookie size limits */
#define TCP_FASTOPEN_COOKIE_MIN	4	/* Min Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_MAX	16	/* Max Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_SIZE 8	/* Cookie size in bytes employed by this implementation */

/* TCP Fast Open Cookie as stored in memory.
 * A len of -1 means that no cookie is present; that is the value
 * fastopen_cookie_present() tests for.
 */
struct tcp_fastopen_cookie {
	s8	len;	/* cookie length, or -1 when no cookie is present */
	u8	val[TCP_FASTOPEN_COOKIE_MAX];	/* cookie bytes */
};

/* This defines a selective acknowledgement (SACK) block whose two sequence
 * numbers are held as big-endian (__be32) values.
 */
struct tcp_sack_block_wire {
	__be32	start_seq;	/* first sequence number of the block */
	__be32	end_seq;	/* end sequence number of the block */
};

/* A selective acknowledgement block with plain u32 sequence numbers
 * (presumably host byte order, in contrast to struct tcp_sack_block_wire;
 * confirm against the code that converts between the two).
 */
struct tcp_sack_block {
	u32	start_seq;	/* first sequence number of the block */
	u32	end_seq;	/* end sequence number of the block */
};

/* These are used to set the sack_ok field in struct tcp_options_received */
#define TCP_SACK_SEEN     (1 << 0)   /* 1 = peer is SACK capable */
#define TCP_FACK_ENABLED  (1 << 1)   /* 1 = FACK is enabled locally */
#define TCP_DSACK_SEEN    (1 << 2)   /* 1 = DSACK was received from peer */

/* TCP option state: timestamp (PAWS/RTTM) data, a 16-bit group of option
 * flags and window scale factors, the SACK block count and MSS values.
 */
struct tcp_options_received {
/*	PAWS/RTTM data	*/
	long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
	u32	ts_recent;	/* Time stamp to echo next		*/
	u32	rcv_tsval;	/* Time stamp value             	*/
	u32	rcv_tsecr;	/* Time stamp echo reply        	*/
	u16 	saw_tstamp : 1,	/* Saw TIMESTAMP on last packet		*/
		tstamp_ok : 1,	/* TIMESTAMP seen on SYN packet		*/
		dsack : 1,	/* D-SACK is scheduled			*/
		wscale_ok : 1,	/* Wscale seen on SYN packet		*/
		sack_ok : 4,	/* SACK seen on SYN packet; holds the
				 * TCP_SACK_SEEN, TCP_FACK_ENABLED and
				 * TCP_DSACK_SEEN flag bits
				 */
		snd_wscale : 4,	/* Window scaling received from sender	*/
		rcv_wscale : 4;	/* Window scaling to send to receiver	*/
	u8	num_sacks;	/* Number of SACK blocks		*/
	u16	user_mss;	/* mss requested by user in ioctl	*/
	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
};

/*
 * Zero the tstamp_ok, sack_ok, wscale_ok and snd_wscale fields of @rx_opt.
 * Every other field of the structure is left untouched.
 */
static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
{
	rx_opt->sack_ok = 0;
	rx_opt->tstamp_ok = 0;
	rx_opt->snd_wscale = 0;
	rx_opt->wscale_ok = 0;
}

/* This is the max number of SACK blocks that we'll generate and process.
 * It's safe to increase this, although since:
 *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
 * only four SACK blocks will fit in a standard TCP header.
 */
#define TCP_NUM_SACKS 4

struct tcp_request_sock_ops;

/* TCP-specific request sock.  tcp_rsk() obtains a pointer to this structure
 * by casting a struct request_sock pointer, so the embedded req member is
 * presumably required to stay first (NOTE(review): confirm against
 * struct inet_request_sock).
 */
struct tcp_request_sock {
	struct inet_request_sock 	req;
#ifdef CONFIG_TCP_MD5SIG
	/* Only used by TCP MD5 Signature so far. */
	const struct tcp_request_sock_ops *af_specific;
#endif
	struct sock			*listener; /* needed for TFO */
	u32				rcv_isn; /* presumably the received initial seq# */
	u32				snt_isn; /* presumably the sent initial seq# */
	u32				snt_synack; /* synack sent time */
	u32				rcv_nxt; /* the ack # by SYNACK. For
						  * FastOpen it's the seq#
						  * after data-in-SYN.
						  */
};

/* View the request sock @req as a struct tcp_request_sock (plain cast). */
static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
{
	struct tcp_request_sock *treq = (struct tcp_request_sock *)req;

	return treq;
}

/*
 * TCP socket state.  struct inet_connection_sock is embedded as the first
 * member, and tcp_sk() obtains a struct tcp_sock by casting a struct sock
 * pointer, so the member order at the top of this structure matters.
 */
struct tcp_sock {
	/* inet_connection_sock has to be the first member of tcp_sock */
	struct inet_connection_sock	inet_conn;
	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
	u16	xmit_size_goal_segs; /* Goal for segmenting output packets */

/*
 *	Header prediction flags
 *	0x5?10 << 16 + snd_wnd in net byte order
 */
	__be32	pred_flags;

/*
 *	RFC793 variables by their proper names. This means you can
 *	read the code and the spec side by side (and laugh ...)
 *	See RFC793 and RFC1122. The RFC writes these in capitals.
 */
 	u32	rcv_nxt;	/* What we want to receive next 	*/
	u32	copied_seq;	/* Head of yet unread data		*/
	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
 	u32	snd_nxt;	/* Next sequence we send		*/

 	u32	snd_una;	/* First byte we want an ack for	*/
 	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */

	u32	tsoffset;	/* timestamp offset */

	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
	unsigned long	tsq_flags; /* presumably bits named by enum tsq_flags - confirm */

	/* Data for direct copy to user */
	struct {
		struct sk_buff_head	prequeue;
		struct task_struct	*task;
		struct iovec		*iov;
		int			memory;
		int			len;
#ifdef CONFIG_NET_DMA
		/* members for async copy */
		struct dma_chan		*dma_chan;
		int			wakeup;
		struct dma_pinned_list	*pinned_list;
		dma_cookie_t		dma_cookie;
#endif
	} ucopy;

	u32	snd_wl1;	/* Sequence for window update		*/
	u32	snd_wnd;	/* The window we expect to receive	*/
	u32	max_window;	/* Maximal window ever seen from peer	*/
	u32	mss_cache;	/* Cached effective mss, not including SACKS */

	u32	window_clamp;	/* Maximal window to advertise		*/
	u32	rcv_ssthresh;	/* Current window clamp			*/

	u16	advmss;		/* Advertised MSS			*/
	u8	unused;
	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
		thin_lto    : 1,/* Use linear timeouts for thin streams */
		thin_dupack : 1,/* Fast retransmit on first dupack      */
		repair      : 1,
		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
	u8	repair_queue;
	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
		syn_data:1,	/* SYN includes data */
		syn_fastopen:1,	/* SYN includes Fast Open option */
		syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */

/* RTT measurement */
	u32	srtt;		/* smoothed round trip time << 3	*/
	u32	mdev;		/* medium deviation			*/
	u32	mdev_max;	/* maximal mdev for the last rtt period	*/
	u32	rttvar;		/* smoothed mdev_max			*/
	u32	rtt_seq;	/* sequence number to update rttvar	*/

	u32	packets_out;	/* Packets which are "in flight"	*/
	u32	retrans_out;	/* Retransmitted packets out		*/

	u16	urg_data;	/* Saved octet of OOB data and control flags */
	u8	ecn_flags;	/* ECN status bits.			*/
	u8	reordering;	/* Packet reordering metric.		*/
	u32	snd_up;		/* Urgent pointer		*/

	u8	keepalive_probes; /* num of allowed keep alive probes	*/
/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
	struct tcp_options_received rx_opt;

/*
 *	Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
 	u32	snd_ssthresh;	/* Slow start size threshold		*/
 	u32	snd_cwnd;	/* Sending congestion window		*/
	u32	snd_cwnd_cnt;	/* Linear increase counter		*/
	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
	u32	snd_cwnd_used;
	u32	snd_cwnd_stamp;
	u32	prior_cwnd;	/* Congestion window at start of Recovery. */
	u32	prr_delivered;	/* Number of newly delivered packets to
				 * receiver in Recovery. */
	u32	prr_out;	/* Total number of pkts sent during Recovery. */

 	u32	rcv_wnd;	/* Current receiver window		*/
	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
	u32	notsent_lowat;	/* TCP_NOTSENT_LOWAT */
	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
	u32	lost_out;	/* Lost packets			*/
	u32	sacked_out;	/* SACK'd packets			*/
	u32	fackets_out;	/* FACK'd packets			*/
	u32	tso_deferred;

	/* from STCP, retrans queue hinting */
	struct sk_buff* lost_skb_hint;
	struct sk_buff *retransmit_skb_hint;

	/* OOO segments go in this list. Note that socket lock must be held,
	 * as we do not use sk_buff_head lock.
	 */
	struct sk_buff_head	out_of_order_queue;

	/* SACKs data, these 2 need to be together (see tcp_options_write) */
	/* NOTE(review): the literal 4 used for selective_acks[] and
	 * recv_sack_cache[] presumably mirrors TCP_NUM_SACKS; confirm, and
	 * keep the array sizes in sync if that limit is ever changed.
	 */
	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

	struct tcp_sack_block recv_sack_cache[4];

	struct sk_buff *highest_sack;   /* skb just after the highest
					 * skb with SACKed bit set
					 * (validity guaranteed only if
					 * sacked_out > 0)
					 */

	int     lost_cnt_hint;
	u32     retransmit_high;	/* L-bits may be on up to this seqno */

	u32	lost_retrans_low;	/* Sent seq after any rxmit (lowest) */

	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
	u32	high_seq;	/* snd_nxt at onset of congestion	*/

	u32	retrans_stamp;	/* Timestamp of the last retransmit,
				 * also used in SYN-SENT to remember stamp of
				 * the first SYN. */
	u32	undo_marker;	/* tracking retrans started here. */
	int	undo_retrans;	/* number of undoable retransmissions. */
	u32	total_retrans;	/* Total retransmits for entire connection */

	u32	urg_seq;	/* Seq of received urgent pointer */
	unsigned int		keepalive_time;	  /* time before keep alive takes place */
	unsigned int		keepalive_intvl;  /* time interval between keep alive probes */

	int			linger2;

/* Receiver side RTT estimation */
	struct {
		u32	rtt;
		u32	seq;
		u32	time;
	} rcv_rtt_est;

/* Receiver queue space */
	struct {
		int	space;
		u32	seq;
		u32	time;
	} rcvq_space;

/* TCP-specific MTU probe information. */
	struct {
		u32		  probe_seq_start;
		u32		  probe_seq_end;
	} mtu_probe;
	u32	mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
			   * while socket was owned by user.
			   */

#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
	const struct tcp_sock_af_ops	*af_specific;

/* TCP MD5 Signature Option information */
	struct tcp_md5sig_info	__rcu *md5sig_info;
#endif

/* TCP fastopen related information */
	struct tcp_fastopen_request *fastopen_req;
	/* fastopen_rsk points to request_sock that resulted in this big
	 * socket. Used to retransmit SYNACKs etc.
	 */
	struct request_sock *fastopen_rsk;
};

/* Flag identifiers, numbered from 0 in declaration order.  Presumably used
 * as bit numbers within the tsq_flags member of struct tcp_sock
 * (NOTE(review): confirm against the users of that field).
 */
enum tsq_flags {
	TSQ_THROTTLED,
	TSQ_QUEUED,
	TCP_TSQ_DEFERRED,	   /* tcp_tasklet_func() found socket was owned */
	TCP_WRITE_TIMER_DEFERRED,  /* tcp_write_timer() found socket was owned */
	TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
				    * tcp_v{4|6}_mtu_reduced()
				    */
};

/* View the socket @sk as a struct tcp_sock (plain cast, no checks). */
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
	struct tcp_sock *tp = (struct tcp_sock *)sk;

	return tp;
}

/* TCP-specific time-wait socket.  struct inet_timewait_sock is embedded as
 * the first member, and tcp_twsk() obtains this structure by casting a
 * struct sock pointer.
 */
struct tcp_timewait_sock {
	struct inet_timewait_sock tw_sk;
	u32			  tw_rcv_nxt;
	u32			  tw_snd_nxt;
	u32			  tw_rcv_wnd;
	u32			  tw_ts_offset;
	/* NOTE(review): the next two look like counterparts of ts_recent and
	 * ts_recent_stamp in struct tcp_options_received - confirm.
	 */
	u32			  tw_ts_recent;
	long			  tw_ts_recent_stamp;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key	  *tw_md5_key;
#endif
};

/* View the socket @sk as a struct tcp_timewait_sock (plain cast, no checks). */
static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
{
	struct tcp_timewait_sock *tcptw = (struct tcp_timewait_sock *)sk;

	return tcptw;
}

/*
 * Return true when @sk is in the TCP_SYN_RECV state and its fastopen_rsk
 * pointer is set; false otherwise.
 */
static inline bool tcp_passive_fastopen(const struct sock *sk)
{
	/* The state is checked first, so fastopen_rsk is only read in SYN_RECV. */
	if (sk->sk_state != TCP_SYN_RECV)
		return false;

	return tcp_sk(sk)->fastopen_rsk != NULL;
}

static inline bool fastopen_cookie_present(struct tcp_fastopen_cookie *foc)
{
	return foc->len != -1;
}

extern void tcp_sock_destruct(struct sock *sk);

/*
 * Make sure the accept queue of @sk has a fastopen queue and set that
 * queue's max_qlen to @backlog.
 *
 * On first use the queue is allocated zeroed with sk->sk_allocation,
 * sk->sk_destruct is pointed at tcp_sock_destruct and the queue lock is
 * initialised.  On later calls only max_qlen is updated.
 *
 * Returns 0 on success, or -ENOMEM when the allocation fails.
 */
static inline int fastopen_init_queue(struct sock *sk, int backlog)
{
	struct request_sock_queue *acceptq = &inet_csk(sk)->icsk_accept_queue;

	if (!acceptq->fastopenq) {
		acceptq->fastopenq = kzalloc(sizeof(struct fastopen_queue),
					     sk->sk_allocation);
		if (!acceptq->fastopenq)
			return -ENOMEM;

		sk->sk_destruct = tcp_sock_destruct;
		spin_lock_init(&acceptq->fastopenq->lock);
	}

	acceptq->fastopenq->max_qlen = backlog;
	return 0;
}

#endif	/* _LINUX_TCP_H */
eted_pages > 0) { for (i = 0; i < nr_completed_pages; i++) { int di = completed_pages[i]; dp = dest_pages[di]; /* * If we are outside the initialized size, zero * the out of bounds page range. */ handle_bounds_compressed_page(dp, i_size, initialized_size); flush_dcache_page(dp); kunmap(dp); SetPageUptodate(dp); unlock_page(dp); if (di == xpage) *xpage_done = 1; else put_page(dp); dest_pages[di] = NULL; } } return err; } /* Setup offsets for the current sub-block destination. */ do_sb_start = *dest_ofs; do_sb_end = do_sb_start + NTFS_SB_SIZE; /* Check that we are still within allowed boundaries. */ if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) goto return_overflow; /* Does the minimum size of a compressed sb overflow valid range? */ if (cb + 6 > cb_end) goto return_overflow; /* Setup the current sub-block source pointers and validate range. */ cb_sb_start = cb; cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) + 3; if (cb_sb_end > cb_end) goto return_overflow; /* Get the current destination page. */ dp = dest_pages[*dest_index]; if (!dp) { /* No page present. Skip decompression of this sub-block. */ cb = cb_sb_end; /* Advance destination position to next sub-block. */ *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK; if (!*dest_ofs && (++*dest_index > dest_max_index)) goto return_overflow; goto do_next_sb; } /* We have a valid destination page. Setup the destination pointers. */ dp_addr = (u8*)page_address(dp) + do_sb_start; /* Now, we are ready to process the current sub-block (sb). */ if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { ntfs_debug("Found uncompressed sub-block."); /* This sb is not compressed, just copy it into destination. */ /* Advance source position to first data byte. */ cb += 2; /* An uncompressed sb must be full size. */ if (cb_sb_end - cb != NTFS_SB_SIZE) goto return_overflow; /* Copy the block and advance the source position. 
*/ memcpy(dp_addr, cb, NTFS_SB_SIZE); cb += NTFS_SB_SIZE; /* Advance destination position to next sub-block. */ *dest_ofs += NTFS_SB_SIZE; if (!(*dest_ofs &= ~PAGE_MASK)) { finalize_page: /* * First stage: add current page index to array of * completed pages. */ completed_pages[nr_completed_pages++] = *dest_index; if (++*dest_index > dest_max_index) goto return_overflow; } goto do_next_sb; } ntfs_debug("Found compressed sub-block."); /* This sb is compressed, decompress it into destination. */ /* Setup destination pointers. */ dp_sb_start = dp_addr; dp_sb_end = dp_sb_start + NTFS_SB_SIZE; /* Forward to the first tag in the sub-block. */ cb += 2; do_next_tag: if (cb == cb_sb_end) { /* Check if the decompressed sub-block was not full-length. */ if (dp_addr < dp_sb_end) { int nr_bytes = do_sb_end - *dest_ofs; ntfs_debug("Filling incomplete sub-block with " "zeroes."); /* Zero remainder and update destination position. */ memset(dp_addr, 0, nr_bytes); *dest_ofs += nr_bytes; } /* We have finished the current sub-block. */ if (!(*dest_ofs &= ~PAGE_MASK)) goto finalize_page; goto do_next_sb; } /* Check we are still in range. */ if (cb > cb_sb_end || dp_addr > dp_sb_end) goto return_overflow; /* Get the next tag and advance to first token. */ tag = *cb++; /* Parse the eight tokens described by the tag. */ for (token = 0; token < 8; token++, tag >>= 1) { u16 lg, pt, length, max_non_overlap; register u16 i; u8 *dp_back_addr; /* Check if we are done / still in range. */ if (cb >= cb_sb_end || dp_addr > dp_sb_end) break; /* Determine token type and parse appropriately.*/ if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { /* * We have a symbol token, copy the symbol across, and * advance the source and destination positions. */ *dp_addr++ = *cb++; ++*dest_ofs; /* Continue with the next token. */ continue; } /* * We have a phrase token. Make sure it is not the first tag in * the sb as this is illegal and would confuse the code below. 
*/ if (dp_addr == dp_sb_start) goto return_overflow; /* * Determine the number of bytes to go back (p) and the number * of bytes to copy (l). We use an optimized algorithm in which * we first calculate log2(current destination position in sb), * which allows determination of l and p in O(1) rather than * O(n). We just need an arch-optimized log2() function now. */ lg = 0; for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) lg++; /* Get the phrase token into i. */ pt = le16_to_cpup((le16*)cb); /* * Calculate starting position of the byte sequence in * the destination using the fact that p = (pt >> (12 - lg)) + 1 * and make sure we don't go too far back. */ dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; if (dp_back_addr < dp_sb_start) goto return_overflow; /* Now calculate the length of the byte sequence. */ length = (pt & (0xfff >> lg)) + 3; /* Advance destination position and verify it is in range. */ *dest_ofs += length; if (*dest_ofs > do_sb_end) goto return_overflow; /* The number of non-overlapping bytes. */ max_non_overlap = dp_addr - dp_back_addr; if (length <= max_non_overlap) { /* The byte sequence doesn't overlap, just copy it. */ memcpy(dp_addr, dp_back_addr, length); /* Advance destination pointer. */ dp_addr += length; } else { /* * The byte sequence does overlap, copy non-overlapping * part and then do a slow byte by byte copy for the * overlapping part. Also, advance the destination * pointer. */ memcpy(dp_addr, dp_back_addr, max_non_overlap); dp_addr += max_non_overlap; dp_back_addr += max_non_overlap; length -= max_non_overlap; while (length--) *dp_addr++ = *dp_back_addr++; } /* Advance source position and continue with the next token. */ cb += 2; } /* No tokens left in the current tag. Continue with the next tag. */ goto do_next_tag; return_overflow: ntfs_error(NULL, "Failed. 
Returning -EOVERFLOW."); goto return_error; } /** * ntfs_read_compressed_block - read a compressed block into the page cache * @page: locked page in the compression block(s) we need to read * * When we are called the page has already been verified to be locked and the * attribute is known to be non-resident, not encrypted, but compressed. * * 1. Determine which compression block(s) @page is in. * 2. Get hold of all pages corresponding to this/these compression block(s). * 3. Read the (first) compression block. * 4. Decompress it into the corresponding pages. * 5. Throw the compressed data away and proceed to 3. for the next compression * block or return success if no more compression blocks left. * * Warning: We have to be careful what we do about existing pages. They might * have been written to so that we would lose data if we were to just overwrite * them with the out-of-date uncompressed data. * * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at * the end of the file I think. We need to detect this case and zero the out * of bounds remainder of the page in question and mark it as handled. At the * moment we would just return -EIO on such a page. This bug will only become * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte * clusters so is probably not going to be seen by anyone. Still this should * be fixed. (AIA) * * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in * handling sparse and compressed cbs. (AIA) * * FIXME: At the moment we don't do any zeroing out in the case that * initialized_size is less than data_size. This should be safe because of the * nature of the compression algorithm used. Just in case we check and output * an error message in read inode if the two sizes are not equal for a * compressed file. 
(AIA) */ int ntfs_read_compressed_block(struct page *page) { loff_t i_size; s64 initialized_size; struct address_space *mapping = page->mapping; ntfs_inode *ni = NTFS_I(mapping->host); ntfs_volume *vol = ni->vol; struct super_block *sb = vol->sb; runlist_element *rl; unsigned long flags, block_size = sb->s_blocksize; unsigned char block_size_bits = sb->s_blocksize_bits; u8 *cb, *cb_pos, *cb_end; struct buffer_head **bhs; unsigned long offset, index = page->index; u32 cb_size = ni->itype.compressed.block_size; u64 cb_size_mask = cb_size - 1UL; VCN vcn; LCN lcn; /* The first wanted vcn (minimum alignment is PAGE_SIZE). */ VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> vol->cluster_size_bits; /* * The first vcn after the last wanted vcn (minimum alignment is again * PAGE_SIZE. */ VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) & ~cb_size_mask) >> vol->cluster_size_bits; /* Number of compression blocks (cbs) in the wanted vcn range. */ unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits >> ni->itype.compressed.block_size_bits; /* * Number of pages required to store the uncompressed data from all * compression blocks (cbs) overlapping @page. Due to alignment * guarantees of start_vcn and end_vcn, no need to round up here. */ unsigned int nr_pages = (end_vcn - start_vcn) << vol->cluster_size_bits >> PAGE_SHIFT; unsigned int xpage, max_page, cur_page, cur_ofs, i; unsigned int cb_clusters, cb_max_ofs; int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; struct page **pages; unsigned char xpage_done = 0; ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " "%i.", index, cb_size, nr_pages); /* * Bad things happen if we get here for anything that is not an * unnamed $DATA attribute. */ BUG_ON(ni->type != AT_DATA); BUG_ON(ni->name_len); pages = kmalloc(nr_pages * sizeof(struct page *), GFP_NOFS); /* Allocate memory to store the buffer heads we need. 
*/ bhs_size = cb_size / block_size * sizeof(struct buffer_head *); bhs = kmalloc(bhs_size, GFP_NOFS); if (unlikely(!pages || !bhs)) { kfree(bhs); kfree(pages); unlock_page(page); ntfs_error(vol->sb, "Failed to allocate internal buffers."); return -ENOMEM; } /* * We have already been given one page, this is the one we must do. * Once again, the alignment guarantees keep it simple. */ offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT; xpage = index - offset; pages[xpage] = page; /* * The remaining pages need to be allocated and inserted into the page * cache, alignment guarantees keep all the below much simpler. (-8 */ read_lock_irqsave(&ni->size_lock, flags); i_size = i_size_read(VFS_I(ni)); initialized_size = ni->initialized_size; read_unlock_irqrestore(&ni->size_lock, flags); max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) - offset; /* Is the page fully outside i_size? (truncate in progress) */ if (xpage >= max_page) { kfree(bhs); kfree(pages); zero_user(page, 0, PAGE_SIZE); ntfs_debug("Compressed read outside i_size - truncated?"); SetPageUptodate(page); unlock_page(page); return 0; } if (nr_pages < max_page) max_page = nr_pages; for (i = 0; i < max_page; i++, offset++) { if (i != xpage) pages[i] = grab_cache_page_nowait(mapping, offset); page = pages[i]; if (page) { /* * We only (re)read the page if it isn't already read * in and/or dirty or we would be losing data or at * least wasting our time. */ if (!PageDirty(page) && (!PageUptodate(page) || PageError(page))) { ClearPageError(page); kmap(page); continue; } unlock_page(page); put_page(page); pages[i] = NULL; } } /* * We have the runlist, and all the destination pages we need to fill. * Now read the first compression block. */ cur_page = 0; cur_ofs = 0; cb_clusters = ni->itype.compressed.block_clusters; do_next_cb: nr_cbs--; nr_bhs = 0; /* Read all cb buffer heads one cluster at a time. 
*/ rl = NULL; for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; vcn++) { bool is_retry = false; if (!rl) { lock_retry_remap: down_read(&ni->runlist.lock); rl = ni->runlist.rl; } if (likely(rl != NULL)) { /* Seek to element containing target vcn. */ while (rl->length && rl[1].vcn <= vcn) rl++; lcn = ntfs_rl_vcn_to_lcn(rl, vcn); } else lcn = LCN_RL_NOT_MAPPED; ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", (unsigned long long)vcn, (unsigned long long)lcn); if (lcn < 0) { /* * When we reach the first sparse cluster we have * finished with the cb. */ if (lcn == LCN_HOLE) break; if (is_retry || lcn != LCN_RL_NOT_MAPPED) goto rl_err; is_retry = true; /* * Attempt to map runlist, dropping lock for the * duration. */ up_read(&ni->runlist.lock); if (!ntfs_map_runlist(ni, vcn)) goto lock_retry_remap; goto map_rl_err; } block = lcn << vol->cluster_size_bits >> block_size_bits; /* Read the lcn from device in chunks of block_size bytes. */ max_block = block + (vol->cluster_size >> block_size_bits); do { ntfs_debug("block = 0x%x.", block); if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) goto getblk_err; nr_bhs++; } while (++block < max_block); } /* Release the lock if we took it. */ if (rl) up_read(&ni->runlist.lock); /* Setup and initiate io on all buffer heads. */ for (i = 0; i < nr_bhs; i++) { struct buffer_head *tbh = bhs[i]; if (!trylock_buffer(tbh)) continue; if (unlikely(buffer_uptodate(tbh))) { unlock_buffer(tbh); continue; } get_bh(tbh); tbh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, 0, tbh); } /* Wait for io completion on all buffer heads. */ for (i = 0; i < nr_bhs; i++) { struct buffer_head *tbh = bhs[i]; if (buffer_uptodate(tbh)) continue; wait_on_buffer(tbh); /* * We need an optimization barrier here, otherwise we start * hitting the below fixup code when accessing a loopback * mounted ntfs partition. 
This indicates either there is a * race condition in the loop driver or, more likely, gcc * overoptimises the code without the barrier and it doesn't * do the Right Thing(TM). */ barrier(); if (unlikely(!buffer_uptodate(tbh))) { ntfs_warning(vol->sb, "Buffer is unlocked but not " "uptodate! Unplugging the disk queue " "and rescheduling."); get_bh(tbh); io_schedule(); put_bh(tbh); if (unlikely(!buffer_uptodate(tbh))) goto read_err; ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); } } /* * Get the compression buffer. We must not sleep any more * until we are finished with it. */ spin_lock(&ntfs_cb_lock); cb = ntfs_compression_buffer; BUG_ON(!cb); cb_pos = cb; cb_end = cb + cb_size; /* Copy the buffer heads into the contiguous buffer. */ for (i = 0; i < nr_bhs; i++) { memcpy(cb_pos, bhs[i]->b_data, block_size); cb_pos += block_size; } /* Just a precaution. */ if (cb_pos + 2 <= cb + cb_size) *(u16*)cb_pos = 0; /* Reset cb_pos back to the beginning. */ cb_pos = cb; /* We now have both source (if present) and destination. */ ntfs_debug("Successfully read the compression block."); /* The last page and maximum offset within it for the current cb. */ cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size; cb_max_ofs = cb_max_page & ~PAGE_MASK; cb_max_page >>= PAGE_SHIFT; /* Catch end of file inside a compression block. */ if (cb_max_page > max_page) cb_max_page = max_page; if (vcn == start_vcn - cb_clusters) { /* Sparse cb, zero out page range overlapping the cb. */ ntfs_debug("Found sparse compression block."); /* We can sleep from now on, so we drop lock. 
*/ spin_unlock(&ntfs_cb_lock); if (cb_max_ofs) cb_max_page--; for (; cur_page < cb_max_page; cur_page++) { page = pages[cur_page]; if (page) { if (likely(!cur_ofs)) clear_page(page_address(page)); else memset(page_address(page) + cur_ofs, 0, PAGE_SIZE - cur_ofs); flush_dcache_page(page); kunmap(page); SetPageUptodate(page); unlock_page(page); if (cur_page == xpage) xpage_done = 1; else put_page(page); pages[cur_page] = NULL; } cb_pos += PAGE_SIZE - cur_ofs; cur_ofs = 0; if (cb_pos >= cb_end) break; } /* If we have a partial final page, deal with it now. */ if (cb_max_ofs && cb_pos < cb_end) { page = pages[cur_page]; if (page) memset(page_address(page) + cur_ofs, 0, cb_max_ofs - cur_ofs); /* * No need to update cb_pos at this stage: * cb_pos += cb_max_ofs - cur_ofs; */ cur_ofs = cb_max_ofs; } } else if (vcn == start_vcn) { /* We can't sleep so we need two stages. */ unsigned int cur2_page = cur_page; unsigned int cur_ofs2 = cur_ofs; u8 *cb_pos2 = cb_pos; ntfs_debug("Found uncompressed compression block."); /* Uncompressed cb, copy it to the destination pages. */ /* * TODO: As a big optimization, we could detect this case * before we read all the pages and use block_read_full_page() * on all full pages instead (we still have to treat partial * pages especially but at least we are getting rid of the * synchronous io for the majority of pages. * Or if we choose not to do the read-ahead/-behind stuff, we * could just return block_read_full_page(pages[xpage]) as long * as PAGE_SIZE <= cb_size. */ if (cb_max_ofs) cb_max_page--; /* First stage: copy data into destination pages. */ for (; cur_page < cb_max_page; cur_page++) { page = pages[cur_page]; if (page) memcpy(page_address(page) + cur_ofs, cb_pos, PAGE_SIZE - cur_ofs); cb_pos += PAGE_SIZE - cur_ofs; cur_ofs = 0; if (cb_pos >= cb_end) break; } /* If we have a partial final page, deal with it now. 
*/ if (cb_max_ofs && cb_pos < cb_end) { page = pages[cur_page]; if (page) memcpy(page_address(page) + cur_ofs, cb_pos, cb_max_ofs - cur_ofs); cb_pos += cb_max_ofs - cur_ofs; cur_ofs = cb_max_ofs; } /* We can sleep from now on, so drop lock. */ spin_unlock(&ntfs_cb_lock); /* Second stage: finalize pages. */ for (; cur2_page < cb_max_page; cur2_page++) { page = pages[cur2_page]; if (page) { /* * If we are outside the initialized size, zero * the out of bounds page range. */ handle_bounds_compressed_page(page, i_size, initialized_size); flush_dcache_page(page); kunmap(page); SetPageUptodate(page); unlock_page(page); if (cur2_page == xpage) xpage_done = 1; else put_page(page); pages[cur2_page] = NULL; } cb_pos2 += PAGE_SIZE - cur_ofs2; cur_ofs2 = 0; if (cb_pos2 >= cb_end) break; } } else { /* Compressed cb, decompress it into the destination page(s). */ unsigned int prev_cur_page = cur_page; ntfs_debug("Found compressed compression block."); err = ntfs_decompress(pages, &cur_page, &cur_ofs, cb_max_page, cb_max_ofs, xpage, &xpage_done, cb_pos, cb_size - (cb_pos - cb), i_size, initialized_size); /* * We can sleep from now on, lock already dropped by * ntfs_decompress(). */ if (err) { ntfs_error(vol->sb, "ntfs_decompress() failed in inode " "0x%lx with error code %i. Skipping " "this compression block.", ni->mft_no, -err); /* Release the unfinished pages. */ for (; prev_cur_page < cur_page; prev_cur_page++) { page = pages[prev_cur_page]; if (page) { flush_dcache_page(page); kunmap(page); unlock_page(page); if (prev_cur_page != xpage) put_page(page); pages[prev_cur_page] = NULL; } } } } /* Release the buffer heads. */ for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); /* Do we have more work to do? */ if (nr_cbs) goto do_next_cb; /* We no longer need the list of buffer heads. */ kfree(bhs); /* Clean up if we have any pages left. Should never happen. 
*/ for (cur_page = 0; cur_page < max_page; cur_page++) { page = pages[cur_page]; if (page) { ntfs_error(vol->sb, "Still have pages left! " "Terminating them with extreme " "prejudice. Inode 0x%lx, page index " "0x%lx.", ni->mft_no, page->index); flush_dcache_page(page); kunmap(page); unlock_page(page); if (cur_page != xpage) put_page(page); pages[cur_page] = NULL; } } /* We no longer need the list of pages. */ kfree(pages); /* If we have completed the requested page, we return success. */ if (likely(xpage_done)) return 0; ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? "EOVERFLOW" : (!err ? "EIO" : "unknown error")); return err < 0 ? err : -EIO; read_err: ntfs_error(vol->sb, "IO error while reading compressed data."); /* Release the buffer heads. */ for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); goto err_out; map_rl_err: ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " "compression block."); goto err_out; rl_err: up_read(&ni->runlist.lock); ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read " "compression block."); goto err_out; getblk_err: up_read(&ni->runlist.lock); ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); err_out: kfree(bhs); for (i = cur_page; i < max_page; i++) { page = pages[i]; if (page) { flush_dcache_page(page); kunmap(page); unlock_page(page); if (i != xpage) put_page(page); } } kfree(pages); return -EIO; }