/*
 *  linux/include/linux/ext3_fs.h
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/include/linux/minix_fs.h
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#ifndef _LINUX_EXT3_FS_H
#define _LINUX_EXT3_FS_H

#include <linux/types.h>
#include <linux/magic.h>

/*
 * The second extended filesystem constants/structures
 */

/*
 * Define EXT3FS_DEBUG to produce debug messages
 */
#undef EXT3FS_DEBUG

/*
 * Default data block reservation for expanding files (see EXT3_MOUNT_RESERVATION)
 */
#define EXT3_DEFAULT_RESERVE_BLOCKS     8
/* max window size: 1024 (direct blocks) + 3 ([t,d]indirect blocks) */
#define EXT3_MAX_RESERVE_BLOCKS         1027
#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0

/*
 * Debug code
 */
#ifdef EXT3FS_DEBUG
#define ext3_debug(f, a...)						\
	do {								\
		printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:",	\
			__FILE__, __LINE__, __func__);		\
		printk (KERN_DEBUG f, ## a);				\
	} while (0)
#else
#define ext3_debug(f, a...)	do {} while (0)
#endif
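
/*
 * Usage sketch (illustrative only; the function below is hypothetical):
 * with EXT3FS_DEBUG defined, ext3_debug() expands to printk() calls
 * prefixed with file, line and function; with it undefined, the call
 * compiles away completely.
 *
 *	static void example_trace(struct inode *inode)
 *	{
 *		ext3_debug("mapping block for inode %lu\n", inode->i_ino);
 *	}
 */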

/*
 * Special inode numbers
 */
#define	EXT3_BAD_INO		 1	/* Bad blocks inode */
#define EXT3_ROOT_INO		 2	/* Root inode */
#define EXT3_BOOT_LOADER_INO	 5	/* Boot loader inode */
#define EXT3_UNDEL_DIR_INO	 6	/* Undelete directory inode */
#define EXT3_RESIZE_INO		 7	/* Reserved group descriptors inode */
#define EXT3_JOURNAL_INO	 8	/* Journal inode */

/* First non-reserved inode for old ext3 filesystems */
#define EXT3_GOOD_OLD_FIRST_INO	11

/*
 * Maximal count of links to a file
 */
#define EXT3_LINK_MAX		32000

/*
 * Macro-instructions used to manage several block sizes
 */
#define EXT3_MIN_BLOCK_SIZE		1024
#define	EXT3_MAX_BLOCK_SIZE		65536
#define EXT3_MIN_BLOCK_LOG_SIZE		10
#ifdef __KERNEL__
# define EXT3_BLOCK_SIZE(s)		((s)->s_blocksize)
#else
# define EXT3_BLOCK_SIZE(s)		(EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
#endif
#define	EXT3_ADDR_PER_BLOCK(s)		(EXT3_BLOCK_SIZE(s) / sizeof (__u32))
#ifdef __KERNEL__
# define EXT3_BLOCK_SIZE_BITS(s)	((s)->s_blocksize_bits)
#else
# define EXT3_BLOCK_SIZE_BITS(s)	((s)->s_log_block_size + 10)
#endif
#ifdef __KERNEL__
#define	EXT3_ADDR_PER_BLOCK_BITS(s)	(EXT3_SB(s)->s_addr_per_block_bits)
#define EXT3_INODE_SIZE(s)		(EXT3_SB(s)->s_inode_size)
#define EXT3_FIRST_INO(s)		(EXT3_SB(s)->s_first_ino)
#else
#define EXT3_INODE_SIZE(s)	(((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
				 EXT3_GOOD_OLD_INODE_SIZE : \
				 (s)->s_inode_size)
#define EXT3_FIRST_INO(s)	(((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
				 EXT3_GOOD_OLD_FIRST_INO : \
				 (s)->s_first_ino)
#endif
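
/*
 * Worked example (illustrative): for a superblock with
 * s_log_block_size == 2, EXT3_BLOCK_SIZE() is 1024 << 2 == 4096 bytes,
 * EXT3_BLOCK_SIZE_BITS() is 2 + 10 == 12, and EXT3_ADDR_PER_BLOCK() is
 * 4096 / sizeof(__u32) == 1024 block numbers per indirect block.
 */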

/*
 * Macro-instructions used to manage fragments
 */
#define EXT3_MIN_FRAG_SIZE		1024
#define	EXT3_MAX_FRAG_SIZE		4096
#define EXT3_MIN_FRAG_LOG_SIZE		  10
#ifdef __KERNEL__
# define EXT3_FRAG_SIZE(s)		(EXT3_SB(s)->s_frag_size)
# define EXT3_FRAGS_PER_BLOCK(s)	(EXT3_SB(s)->s_frags_per_block)
#else
# define EXT3_FRAG_SIZE(s)		(EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size)
# define EXT3_FRAGS_PER_BLOCK(s)	(EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s))
#endif

/*
 * Structure of a blocks group descriptor
 */
struct ext3_group_desc
{
	__le32	bg_block_bitmap;		/* Blocks bitmap block */
	__le32	bg_inode_bitmap;		/* Inodes bitmap block */
	__le32	bg_inode_table;		/* Inodes table block */
	__le16	bg_free_blocks_count;	/* Free blocks count */
	__le16	bg_free_inodes_count;	/* Free inodes count */
	__le16	bg_used_dirs_count;	/* Directories count */
	__u16	bg_pad;
	__le32	bg_reserved[3];
};

/*
 * Macro-instructions used to manage group descriptors
 */
#ifdef __KERNEL__
# define EXT3_BLOCKS_PER_GROUP(s)	(EXT3_SB(s)->s_blocks_per_group)
# define EXT3_DESC_PER_BLOCK(s)		(EXT3_SB(s)->s_desc_per_block)
# define EXT3_INODES_PER_GROUP(s)	(EXT3_SB(s)->s_inodes_per_group)
# define EXT3_DESC_PER_BLOCK_BITS(s)	(EXT3_SB(s)->s_desc_per_block_bits)
#else
# define EXT3_BLOCKS_PER_GROUP(s)	((s)->s_blocks_per_group)
# define EXT3_DESC_PER_BLOCK(s)		(EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc))
# define EXT3_INODES_PER_GROUP(s)	((s)->s_inodes_per_group)
#endif

/*
 * Constants relative to the data blocks
 */
#define	EXT3_NDIR_BLOCKS		12
#define	EXT3_IND_BLOCK			EXT3_NDIR_BLOCKS
#define	EXT3_DIND_BLOCK			(EXT3_IND_BLOCK + 1)
#define	EXT3_TIND_BLOCK			(EXT3_DIND_BLOCK + 1)
#define	EXT3_N_BLOCKS			(EXT3_TIND_BLOCK + 1)
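
/*
 * Layout sketch (illustrative): i_block[] holds EXT3_NDIR_BLOCKS direct
 * block numbers followed by one singly, one doubly and one triply
 * indirect block.  With 4 KB blocks (1024 addresses per indirect block)
 * an inode can therefore map up to 12 + 1024 + 1024^2 + 1024^3 blocks.
 */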

/*
 * Inode flags
 */
#define	EXT3_SECRM_FL			0x00000001 /* Secure deletion */
#define	EXT3_UNRM_FL			0x00000002 /* Undelete */
#define	EXT3_COMPR_FL			0x00000004 /* Compress file */
#define EXT3_SYNC_FL			0x00000008 /* Synchronous updates */
#define EXT3_IMMUTABLE_FL		0x00000010 /* Immutable file */
#define EXT3_APPEND_FL			0x00000020 /* writes to file may only append */
#define EXT3_NODUMP_FL			0x00000040 /* do not dump file */
#define EXT3_NOATIME_FL			0x00000080 /* do not update atime */
/* Reserved for compression usage... */
#define EXT3_DIRTY_FL			0x00000100
#define EXT3_COMPRBLK_FL		0x00000200 /* One or more compressed clusters */
#define EXT3_NOCOMPR_FL			0x00000400 /* Don't compress */
#define EXT3_ECOMPR_FL			0x00000800 /* Compression error */
/* End compression flags --- maybe not all used */
#define EXT3_INDEX_FL			0x00001000 /* hash-indexed directory */
#define EXT3_IMAGIC_FL			0x00002000 /* AFS directory */
#define EXT3_JOURNAL_DATA_FL		0x00004000 /* file data should be journaled */
#define EXT3_NOTAIL_FL			0x00008000 /* file tail should not be merged */
#define EXT3_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
#define EXT3_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
#define EXT3_RESERVED_FL		0x80000000 /* reserved for ext3 lib */

#define EXT3_FL_USER_VISIBLE		0x0003DFFF /* User visible flags */
#define EXT3_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */

/* Flags that should be inherited by new inodes from their parent. */
#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
			   EXT3_SYNC_FL | EXT3_IMMUTABLE_FL | EXT3_APPEND_FL |\
			   EXT3_NODUMP_FL | EXT3_NOATIME_FL | EXT3_COMPRBLK_FL|\
			   EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
			   EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)

/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))

/* Flags that are appropriate for non-directories/regular files. */
#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)

/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
{
	if (S_ISDIR(mode))
		return flags;
	else if (S_ISREG(mode))
		return flags & EXT3_REG_FLMASK;
	else
		return flags & EXT3_OTHER_FLMASK;
}
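
/*
 * Usage sketch (hypothetical caller): when creating an inode, the flags
 * inherited from the parent directory are filtered by file type, e.g.
 *
 *	__u32 flags = parent_flags & EXT3_FL_INHERITED;
 *	flags = ext3_mask_flags(mode, flags);
 *
 * where parent_flags stands for the containing directory's inode flags,
 * so that directory-only flags such as EXT3_TOPDIR_FL are dropped for
 * regular files.
 */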

/* Used to pass group descriptor data when online resize is done */
struct ext3_new_group_input {
	__u32 group;            /* Group number for this data */
	__u32 block_bitmap;     /* Absolute block number of block bitmap */
	__u32 inode_bitmap;     /* Absolute block number of inode bitmap */
	__u32 inode_table;      /* Absolute block number of inode table start */
	__u32 blocks_count;     /* Total number of blocks in this group */
	__u16 reserved_blocks;  /* Number of reserved blocks in this group */
	__u16 unused;
};

/* The struct ext3_new_group_input in kernel space, with free_blocks_count */
struct ext3_new_group_data {
	__u32 group;
	__u32 block_bitmap;
	__u32 inode_bitmap;
	__u32 inode_table;
	__u32 blocks_count;
	__u16 reserved_blocks;
	__u16 unused;
	__u32 free_blocks_count;
};


/*
 * ioctl commands
 */
#define	EXT3_IOC_GETFLAGS		FS_IOC_GETFLAGS
#define	EXT3_IOC_SETFLAGS		FS_IOC_SETFLAGS
#define	EXT3_IOC_GETVERSION		_IOR('f', 3, long)
#define	EXT3_IOC_SETVERSION		_IOW('f', 4, long)
#define EXT3_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
#define EXT3_IOC_GROUP_ADD		_IOW('f', 8, struct ext3_new_group_input)
#define	EXT3_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
#define	EXT3_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
#ifdef CONFIG_JBD_DEBUG
#define EXT3_IOC_WAIT_FOR_READONLY	_IOR('f', 99, long)
#endif
#define EXT3_IOC_GETRSVSZ		_IOR('f', 5, long)
#define EXT3_IOC_SETRSVSZ		_IOW('f', 6, long)
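
/*
 * User-space usage sketch (illustrative; assumes "fd" is an open file
 * descriptor on an ext3 file and error handling is omitted):
 *
 *	long rsv;
 *	if (ioctl(fd, EXT3_IOC_GETRSVSZ, &rsv) == 0)
 *		printf("reservation window: %ld blocks\n", rsv);
 */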

/*
 * ioctl commands in 32 bit emulation
 */
#define EXT3_IOC32_GETFLAGS		FS_IOC32_GETFLAGS
#define EXT3_IOC32_SETFLAGS		FS_IOC32_SETFLAGS
#define EXT3_IOC32_GETVERSION		_IOR('f', 3, int)
#define EXT3_IOC32_SETVERSION		_IOW('f', 4, int)
#define EXT3_IOC32_GETRSVSZ		_IOR('f', 5, int)
#define EXT3_IOC32_SETRSVSZ		_IOW('f', 6, int)
#define EXT3_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
#ifdef CONFIG_JBD_DEBUG
#define EXT3_IOC32_WAIT_FOR_READONLY	_IOR('f', 99, int)
#endif
#define EXT3_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
#define EXT3_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION


/*
 *  Mount options
 */
struct ext3_mount_options {
	unsigned long s_mount_opt;
	uid_t s_resuid;
	gid_t s_resgid;
	unsigned long s_commit_interval;
#ifdef CONFIG_QUOTA
	int s_jquota_fmt;
	char *s_qf_names[MAXQUOTAS];
#endif
};

/*
 * Structure of an inode on the disk
 */
struct ext3_inode {
	__le16	i_mode;		/* File mode */
	__le16	i_uid;		/* Low 16 bits of Owner Uid */
	__le32	i_size;		/* Size in bytes */
	__le32	i_atime;	/* Access time */
	__le32	i_ctime;	/* Inode change time */
	__le32	i_mtime;	/* Modification time */
	__le32	i_dtime;	/* Deletion Time */
	__le16	i_gid;		/* Low 16 bits of Group Id */
	__le16	i_links_count;	/* Links count */
	__le32	i_blocks;	/* Blocks count */
	__le32	i_flags;	/* File flags */
	union {
		struct {
			__u32  l_i_reserved1;
		} linux1;
		struct {
			__u32  h_i_translator;
		} hurd1;
		struct {
			__u32  m_i_reserved1;
		} masix1;
	} osd1;				/* OS dependent 1 */
	__le32	i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
	__le32	i_generation;	/* File version (for NFS) */
	__le32	i_file_acl;	/* File ACL */
	__le32	i_dir_acl;	/* Directory ACL */
	__le32	i_faddr;	/* Fragment address */
	union {
		struct {
			__u8	l_i_frag;	/* Fragment number */
			__u8	l_i_fsize;	/* Fragment size */
			__u16	i_pad1;
			__le16	l_i_uid_high;	/* these 2 fields    */
			__le16	l_i_gid_high;	/* were reserved2[0] */
			__u32	l_i_reserved2;
		} linux2;
		struct {
			__u8	h_i_frag;	/* Fragment number */
			__u8	h_i_fsize;	/* Fragment size */
			__u16	h_i_mode_high;
			__u16	h_i_uid_high;
			__u16	h_i_gid_high;
			__u32	h_i_author;
		} hurd2;
		struct {
			__u8	m_i_frag;	/* Fragment number */
			__u8	m_i_fsize;	/* Fragment size */
			__u16	m_pad1;
			__u32	m_i_reserved2[2];
		} masix2;
	} osd2;				/* OS dependent 2 */
	__le16	i_extra_isize;
	__le16	i_pad1;
};

#define i_size_high	i_dir_acl
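
/*
 * Note (illustrative): for regular files on a filesystem with the
 * LARGE_FILE ro-compat feature, the 64-bit size is split between i_size
 * (low 32 bits) and i_dir_acl, aliased as i_size_high above.  A reader
 * would reassemble it roughly as (raw being a struct ext3_inode *):
 *
 *	loff_t size = le32_to_cpu(raw->i_size) |
 *		((loff_t)le32_to_cpu(raw->i_size_high) << 32);
 */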

#if defined(__KERNEL__) || defined(__linux__)
#define i_reserved1	osd1.linux1.l_i_reserved1
#define i_frag		osd2.linux2.l_i_frag
#define i_fsize		osd2.linux2.l_i_fsize
#define i_uid_low	i_uid
#define i_gid_low	i_gid
#define i_uid_high	osd2.linux2.l_i_uid_high
#define i_gid_high	osd2.linux2.l_i_gid_high
#define i_reserved2	osd2.linux2.l_i_reserved2

#elif defined(__GNU__)

#define i_translator	osd1.hurd1.h_i_translator
#define i_frag		osd2.hurd2.h_i_frag
#define i_fsize		osd2.hurd2.h_i_fsize
#define i_uid_high	osd2.hurd2.h_i_uid_high
#define i_gid_high	osd2.hurd2.h_i_gid_high
#define i_author	osd2.hurd2.h_i_author

#elif defined(__masix__)

#define i_reserved1	osd1.masix1.m_i_reserved1
#define i_frag		osd2.masix2.m_i_frag
#define i_fsize		osd2.masix2.m_i_fsize
#define i_reserved2	osd2.masix2.m_i_reserved2

#endif /* defined(__KERNEL__) || defined(__linux__) */

/*
 * File system states
 */
#define	EXT3_VALID_FS			0x0001	/* Unmounted cleanly */
#define	EXT3_ERROR_FS			0x0002	/* Errors detected */
#define	EXT3_ORPHAN_FS			0x0004	/* Orphans being recovered */

/*
 * Misc. filesystem flags
 */
#define EXT2_FLAGS_SIGNED_HASH		0x0001  /* Signed dirhash in use */
#define EXT2_FLAGS_UNSIGNED_HASH	0x0002  /* Unsigned dirhash in use */
#define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */

/*
 * Mount flags
 */
#define EXT3_MOUNT_CHECK		0x00001	/* Do mount-time checks */
#define EXT3_MOUNT_OLDALLOC		0x00002  /* Don't use the new Orlov allocator */
#define EXT3_MOUNT_GRPID		0x00004	/* Create files with directory's group */
#define EXT3_MOUNT_DEBUG		0x00008	/* Some debugging messages */
#define EXT3_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
#define EXT3_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */
#define EXT3_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
#define EXT3_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
#define EXT3_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
#define EXT3_MOUNT_ABORT		0x00200	/* Fatal error detected */
#define EXT3_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
#define EXT3_MOUNT_JOURNAL_DATA		0x00400	/* Write data to journal */
#define EXT3_MOUNT_ORDERED_DATA		0x00800	/* Flush data before commit */
#define EXT3_MOUNT_WRITEBACK_DATA	0x00C00	/* No data ordering */
#define EXT3_MOUNT_UPDATE_JOURNAL	0x01000	/* Update the journal format */
#define EXT3_MOUNT_NO_UID32		0x02000  /* Disable 32-bit UIDs */
#define EXT3_MOUNT_XATTR_USER		0x04000	/* Extended user attributes */
#define EXT3_MOUNT_POSIX_ACL		0x08000	/* POSIX Access Control Lists */
#define EXT3_MOUNT_RESERVATION		0x10000	/* Preallocation */
#define EXT3_MOUNT_BARRIER		0x20000 /* Use block barriers */
#define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
#define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
#define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
#define EXT3_MOUNT_DATA_ERR_ABORT	0x400000 /* Abort on file data write
						  * error in ordered mode */

/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt)		o &= ~EXT3_MOUNT_##opt
#define set_opt(o, opt)			o |= EXT3_MOUNT_##opt
#define test_opt(sb, opt)		(EXT3_SB(sb)->s_mount_opt & \
					 EXT3_MOUNT_##opt)
#else
#define EXT2_MOUNT_NOLOAD		EXT3_MOUNT_NOLOAD
#define EXT2_MOUNT_ABORT		EXT3_MOUNT_ABORT
#define EXT2_MOUNT_DATA_FLAGS		EXT3_MOUNT_DATA_FLAGS
#endif
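
/*
 * Usage sketch for the option helpers above (hypothetical mount-time
 * code, where "sbi" is the ext3_sb_info of super_block "sb"):
 *
 *	set_opt(sbi->s_mount_opt, RESERVATION);
 *	if (test_opt(sb, ERRORS_PANIC))
 *		panic("EXT3-fs: panic requested by errors=panic");
 */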

#define ext3_set_bit			ext2_set_bit
#define ext3_set_bit_atomic		ext2_set_bit_atomic
#define ext3_clear_bit			ext2_clear_bit
#define ext3_clear_bit_atomic		ext2_clear_bit_atomic
#define ext3_test_bit			ext2_test_bit
#define ext3_find_first_zero_bit	ext2_find_first_zero_bit
#define ext3_find_next_zero_bit		ext2_find_next_zero_bit

/*
 * Maximal mount count between two filesystem checks
 */
#define EXT3_DFL_MAX_MNT_COUNT		20	/* Allow 20 mounts */
#define EXT3_DFL_CHECKINTERVAL		0	/* Don't use interval check */

/*
 * Behaviour when detecting errors
 */
#define EXT3_ERRORS_CONTINUE		1	/* Continue execution */
#define EXT3_ERRORS_RO			2	/* Remount fs read-only */
#define EXT3_ERRORS_PANIC		3	/* Panic */
#define EXT3_ERRORS_DEFAULT		EXT3_ERRORS_CONTINUE

/*
 * Structure of the super block
 */
struct ext3_super_block {
/*00*/	__le32	s_inodes_count;		/* Inodes count */
	__le32	s_blocks_count;		/* Blocks count */
	__le32	s_r_blocks_count;	/* Reserved blocks count */
	__le32	s_free_blocks_count;	/* Free blocks count */
/*10*/	__le32	s_free_inodes_count;	/* Free inodes count */
	__le32	s_first_data_block;	/* First Data Block */
	__le32	s_log_block_size;	/* Block size */
	__le32	s_log_frag_size;	/* Fragment size */
/*20*/	__le32	s_blocks_per_group;	/* # Blocks per group */
	__le32	s_frags_per_group;	/* # Fragments per group */
	__le32	s_inodes_per_group;	/* # Inodes per group */
	__le32	s_mtime;		/* Mount time */
/*30*/	__le32	s_wtime;		/* Write time */
	__le16	s_mnt_count;		/* Mount count */
	__le16	s_max_mnt_count;	/* Maximal mount count */
	__le16	s_magic;		/* Magic signature */
	__le16	s_state;		/* File system state */
	__le16	s_errors;		/* Behaviour when detecting errors */
	__le16	s_minor_rev_level;	/* minor revision level */
/*40*/	__le32	s_lastcheck;		/* time of last check */
	__le32	s_checkinterval;	/* max. time between checks */
	__le32	s_creator_os;		/* OS */
	__le32	s_rev_level;		/* Revision level */
/*50*/	__le16	s_def_resuid;		/* Default uid for reserved blocks */
	__le16	s_def_resgid;		/* Default gid for reserved blocks */
	/*
	 * These fields are for EXT3_DYNAMIC_REV superblocks only.
	 *
	 * Note: the difference between the compatible feature set and
	 * the incompatible feature set is that if there is a bit set
	 * in the incompatible feature set that the kernel doesn't
	 * know about, it should refuse to mount the filesystem.
	 *
	 * e2fsck's requirements are more strict; if it doesn't know
	 * about a feature in either the compatible or incompatible
	 * feature set, it must abort and not try to meddle with
	 * things it doesn't understand...
	 */
	__le32	s_first_ino;		/* First non-reserved inode */
	__le16   s_inode_size;		/* size of inode structure */
	__le16	s_block_group_nr;	/* block group # of this superblock */
	__le32	s_feature_compat;	/* compatible feature set */
/*60*/	__le32	s_feature_incompat;	/* incompatible feature set */
	__le32	s_feature_ro_compat;	/* readonly-compatible feature set */
/*68*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
/*78*/	char	s_volume_name[16];	/* volume name */
/*88*/	char	s_last_mounted[64];	/* directory where last mounted */
/*C8*/	__le32	s_algorithm_usage_bitmap; /* For compression */
	/*
	 * Performance hints.  Directory preallocation should only
	 * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
	 */
	__u8	s_prealloc_blocks;	/* Nr of blocks to try to preallocate*/
	__u8	s_prealloc_dir_blocks;	/* Nr to preallocate for dirs */
	__le16	s_reserved_gdt_blocks;	/* Per group desc for online growth */
	/*
	 * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
	 */
/*D0*/	__u8	s_journal_uuid[16];	/* uuid of journal superblock */
/*E0*/	__le32	s_journal_inum;		/* inode number of journal file */
	__le32	s_journal_dev;		/* device number of journal file */
	__le32	s_last_orphan;		/* start of list of inodes to delete */
	__le32	s_hash_seed[4];		/* HTREE hash seed */
	__u8	s_def_hash_version;	/* Default hash version to use */
	__u8	s_reserved_char_pad;
	__u16	s_reserved_word_pad;
	__le32	s_default_mount_opts;
	__le32	s_first_meta_bg;	/* First metablock block group */
	__le32	s_mkfs_time;		/* When the filesystem was created */
	__le32	s_jnl_blocks[17];	/* Backup of the journal inode */
	/* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
/*150*/	__le32	s_blocks_count_hi;	/* Blocks count */
	__le32	s_r_blocks_count_hi;	/* Reserved blocks count */
	__le32	s_free_blocks_count_hi;	/* Free blocks count */
	__le16	s_min_extra_isize;	/* All inodes have at least # bytes */
	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
	__le32	s_flags;		/* Miscellaneous flags */
	__le16  s_raid_stride;		/* RAID stride */
	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
	__le64  s_mmp_block;            /* Block for multi-mount protection */
	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
	__u8	s_reserved_char_pad2;
	__le16  s_reserved_pad;
	__u32   s_reserved[162];        /* Padding to the end of the block */
};

#ifdef __KERNEL__
#include <linux/ext3_fs_i.h>
#include <linux/ext3_fs_sb.h>
static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}
static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
{
	return container_of(inode, struct ext3_inode_info, vfs_inode);
}

static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
{
	return ino == EXT3_ROOT_INO ||
		ino == EXT3_JOURNAL_INO ||
		ino == EXT3_RESIZE_INO ||
		(ino >= EXT3_FIRST_INO(sb) &&
		 ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
}

/*
 * Inode dynamic state flags
 */
enum {
	EXT3_STATE_JDATA,		/* journaled data exists */
	EXT3_STATE_NEW,			/* inode is newly created */
	EXT3_STATE_XATTR,		/* has in-inode xattrs */
	EXT3_STATE_FLUSH_ON_CLOSE,	/* flush dirty pages on close */
};

static inline int ext3_test_inode_state(struct inode *inode, int bit)
{
	return test_bit(bit, &EXT3_I(inode)->i_state_flags);
}

static inline void ext3_set_inode_state(struct inode *inode, int bit)
{
	set_bit(bit, &EXT3_I(inode)->i_state_flags);
}

static inline void ext3_clear_inode_state(struct inode *inode, int bit)
{
	clear_bit(bit, &EXT3_I(inode)->i_state_flags);
}
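
/*
 * Usage sketch (illustrative): the helpers above wrap atomic bit
 * operations on i_state_flags, e.g.
 *
 *	if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
 *		ext3_clear_inode_state(inode, EXT3_STATE_NEW);
 */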
#else
/* Assume that user mode programs are passing in an ext3fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
 * macros from user land. */
#define EXT3_SB(sb)	(sb)
#endif

#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime

/*
 * Codes for operating systems
 */
#define EXT3_OS_LINUX		0
#define EXT3_OS_HURD		1
#define EXT3_OS_MASIX		2
#define EXT3_OS_FREEBSD		3
#define EXT3_OS_LITES		4

/*
 * Revision levels
 */
#define EXT3_GOOD_OLD_REV	0	/* The good old (original) format */
#define EXT3_DYNAMIC_REV	1	/* V2 format w/ dynamic inode sizes */

#define EXT3_CURRENT_REV	EXT3_GOOD_OLD_REV
#define EXT3_MAX_SUPP_REV	EXT3_DYNAMIC_REV

#define EXT3_GOOD_OLD_INODE_SIZE 128

/*
 * Feature set definitions
 */

#define EXT3_HAS_COMPAT_FEATURE(sb,mask)			\
	( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask)			\
	( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask)			\
	( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
#define EXT3_SET_COMPAT_FEATURE(sb,mask)			\
	EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask)			\
	EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
#define EXT3_SET_INCOMPAT_FEATURE(sb,mask)			\
	EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask)			\
	EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
	EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
	EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)

#define EXT3_FEATURE_COMPAT_DIR_PREALLOC	0x0001
#define EXT3_FEATURE_COMPAT_IMAGIC_INODES	0x0002
#define EXT3_FEATURE_COMPAT_HAS_JOURNAL		0x0004
#define EXT3_FEATURE_COMPAT_EXT_ATTR		0x0008
#define EXT3_FEATURE_COMPAT_RESIZE_INODE	0x0010
#define EXT3_FEATURE_COMPAT_DIR_INDEX		0x0020

#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR	0x0004

#define EXT3_FEATURE_INCOMPAT_COMPRESSION	0x0001
#define EXT3_FEATURE_INCOMPAT_FILETYPE		0x0002
#define EXT3_FEATURE_INCOMPAT_RECOVER		0x0004 /* Needs recovery */
#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV	0x0008 /* Journal device */
#define EXT3_FEATURE_INCOMPAT_META_BG		0x0010

#define EXT3_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT3_FEATURE_INCOMPAT_FILETYPE| \
					 EXT3_FEATURE_INCOMPAT_RECOVER| \
					 EXT3_FEATURE_INCOMPAT_META_BG)
#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
					 EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
					 EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
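
/*
 * Usage sketch (illustrative): a mount-time check can refuse any
 * filesystem carrying incompatible feature bits outside the supported
 * set, e.g.
 *
 *	if (EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
 *		return -EINVAL;
 */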

/*
 * Default values for user and/or group using reserved blocks
 */
#define	EXT3_DEF_RESUID		0
#define	EXT3_DEF_RESGID		0

/*
 * Default mount options
 */
#define EXT3_DEFM_DEBUG		0x0001
#define EXT3_DEFM_BSDGROUPS	0x0002
#define EXT3_DEFM_XATTR_USER	0x0004
#define EXT3_DEFM_ACL		0x0008
#define EXT3_DEFM_UID16		0x0010
#define EXT3_DEFM_JMODE		0x0060
#define EXT3_DEFM_JMODE_DATA	0x0020
#define EXT3_DEFM_JMODE_ORDERED	0x0040
#define EXT3_DEFM_JMODE_WBACK	0x0060

/*
 * Structure of a directory entry
 */
#define EXT3_NAME_LEN 255

struct ext3_dir_entry {
	__le32	inode;			/* Inode number */
	__le16	rec_len;		/* Directory entry length */
	__le16	name_len;		/* Name length */
	char	name[EXT3_NAME_LEN];	/* File name */
};

/*
 * The new version of the directory entry.  Since EXT3 structures are
 * stored in little-endian (Intel) byte order, and the name_len field
 * can never be bigger than 255 chars, it's safe to reclaim the extra
 * byte for the file_type field.
 */
struct ext3_dir_entry_2 {
	__le32	inode;			/* Inode number */
	__le16	rec_len;		/* Directory entry length */
	__u8	name_len;		/* Name length */
	__u8	file_type;
	char	name[EXT3_NAME_LEN];	/* File name */
};

/*
 * Ext3 directory file types.  Only the low 3 bits are used.  The
 * other bits are reserved for now.
 */
#define EXT3_FT_UNKNOWN		0
#define EXT3_FT_REG_FILE	1
#define EXT3_FT_DIR		2
#define EXT3_FT_CHRDEV		3
#define EXT3_FT_BLKDEV		4
#define EXT3_FT_FIFO		5
#define EXT3_FT_SOCK		6
#define EXT3_FT_SYMLINK		7

#define EXT3_FT_MAX		8

/*
 * EXT3_DIR_PAD defines the directory entry boundaries
 *
 * NOTE: It must be a multiple of 4
 */
#define EXT3_DIR_PAD			4
#define EXT3_DIR_ROUND			(EXT3_DIR_PAD - 1)
#define EXT3_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT3_DIR_ROUND) & \
					 ~EXT3_DIR_ROUND)
#define EXT3_MAX_REC_LEN		((1<<16)-1)
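
/*
 * Worked example (illustrative): a 5-character name needs
 * EXT3_DIR_REC_LEN(5) == (5 + 8 + 3) & ~3 == 16 bytes, i.e. the 8-byte
 * fixed header plus the name, rounded up to a multiple of 4.
 */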

static inline unsigned ext3_rec_len_from_disk(__le16 dlen)
{
	unsigned len = le16_to_cpu(dlen);

	if (len == EXT3_MAX_REC_LEN)
		return 1 << 16;
	return len;
}

static inline __le16 ext3_rec_len_to_disk(unsigned len)
{
	if (len == (1 << 16))
		return cpu_to_le16(EXT3_MAX_REC_LEN);
	else if (len > (1 << 16))
		BUG();
	return cpu_to_le16(len);
}

/*
 * Hash Tree Directory indexing
 * (c) Daniel Phillips, 2001
 */

#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
				      EXT3_FEATURE_COMPAT_DIR_INDEX) && \
		      (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)

/* Legal values for the dx_root hash_version field: */

#define DX_HASH_LEGACY		0
#define DX_HASH_HALF_MD4	1
#define DX_HASH_TEA		2
#define DX_HASH_LEGACY_UNSIGNED	3
#define DX_HASH_HALF_MD4_UNSIGNED	4
#define DX_HASH_TEA_UNSIGNED		5

#ifdef __KERNEL__

/* hash info structure used by the directory hash */
struct dx_hash_info
{
	u32		hash;
	u32		minor_hash;
	int		hash_version;
	u32		*seed;
};

#define EXT3_HTREE_EOF	0x7fffffff

/*
 * Control parameters used by ext3_htree_next_block
 */
#define HASH_NB_ALWAYS		1


/*
 * Describe an inode's exact location on disk and in memory
 */
struct ext3_iloc
{
	struct buffer_head *bh;
	unsigned long offset;
	unsigned long block_group;
};

static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc)
{
	return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset);
}

/*
 * This structure is stuffed into the struct file's private_data field
 * for directories.  It is where we put information so that we can do
 * readdir operations in hash tree order.
 */
struct dir_private_info {
	struct rb_root	root;
	struct rb_node	*curr_node;
	struct fname	*extra_fname;
	loff_t		last_pos;
	__u32		curr_hash;
	__u32		curr_minor_hash;
	__u32		next_hash;
};

/* calculate the first block number of the group */
static inline ext3_fsblk_t
ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
{
	return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
}
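
/*
 * Worked example (illustrative): with 4 KB blocks a group typically
 * holds 32768 blocks and s_first_data_block is 0, so group 3 starts at
 * block 3 * 32768 + 0 == 98304.
 */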

/*
 * Special error return code only used by dx_probe() and its callers.
 */
#define ERR_BAD_DX_DIR	-75000

/*
 * Function prototypes
 */

/*
 * Ok, these declarations are also in <linux/kernel.h> but none of the
 * ext3 source programs needs to include it so they are duplicated here.
 */
# define NORET_TYPE    /**/
# define ATTRIB_NORET  __attribute__((noreturn))
# define NORET_AND     noreturn,

/* balloc.c */
extern int ext3_bg_has_super(struct super_block *sb, int group);
extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
			ext3_fsblk_t goal, int *errp);
extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
			ext3_fsblk_t goal, unsigned long *count, int *errp);
extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
			ext3_fsblk_t block, unsigned long count);
extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
				 ext3_fsblk_t block, unsigned long count,
				unsigned long *pdquot_freed_blocks);
extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
extern void ext3_check_blocks_bitmap (struct super_block *);
extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
						    unsigned int block_group,
						    struct buffer_head ** bh);
extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
extern void ext3_init_block_alloc_info(struct inode *);
extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);

/* dir.c */
extern int ext3_check_dir_entry(const char *, struct inode *,
				struct ext3_dir_entry_2 *,
				struct buffer_head *, unsigned long);
extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
				    __u32 minor_hash,
				    struct ext3_dir_entry_2 *dirent);
extern void ext3_htree_free_dir_info(struct dir_private_info *p);

/* fsync.c */
extern int ext3_sync_file(struct file *, int);

/* hash.c */
extern int ext3fs_dirhash(const char *name, int len, struct
			  dx_hash_info *hinfo);

/* ialloc.c */
extern struct inode * ext3_new_inode (handle_t *, struct inode *, int);
extern void ext3_free_inode (handle_t *, struct inode *);
extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
extern unsigned long ext3_count_free_inodes (struct super_block *);
extern unsigned long ext3_count_dirs (struct super_block *);
extern void ext3_check_inodes_bitmap (struct super_block *);
extern unsigned long ext3_count_free (struct buffer_head *, unsigned);


/* inode.c */
int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
		struct buffer_head *bh, ext3_fsblk_t blocknr);
struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
	sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
	int create);

extern struct inode *ext3_iget(struct super_block *, unsigned long);
extern int  ext3_write_inode (struct inode *, struct writeback_control *);
extern int  ext3_setattr (struct dentry *, struct iattr *);
extern void ext3_evict_inode (struct inode *);
extern int  ext3_sync_inode (handle_t *, struct inode *);
extern void ext3_discard_reservation (struct inode *);
extern void ext3_dirty_inode(struct inode *);
extern int ext3_change_inode_journal_flag(struct inode *, int);
extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
extern int ext3_can_truncate(struct inode *inode);
extern void ext3_truncate (struct inode *);
extern void ext3_set_inode_flags(struct inode *);
extern void ext3_get_inode_flags(struct ext3_inode_info *);
extern void ext3_set_aops(struct inode *inode);
extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		       u64 start, u64 len);

/* ioctl.c */
extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);

/* namei.c */
extern int ext3_orphan_add(handle_t *, struct inode *);
extern int ext3_orphan_del(handle_t *, struct inode *);
extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
				__u32 start_minor_hash, __u32 *next_hash);

/* resize.c */
extern int ext3_group_add(struct super_block *sb,
				struct ext3_new_group_data *input);
extern int ext3_group_extend(struct super_block *sb,
				struct ext3_super_block *es,
				ext3_fsblk_t n_blocks_count);

/* super.c */
extern void ext3_error (struct super_block *, const char *, const char *, ...)
	__attribute__ ((format (printf, 3, 4)));
extern void __ext3_std_error (struct super_block *, const char *, int);
extern void ext3_abort (struct super_block *, const char *, const char *, ...)
	__attribute__ ((format (printf, 3, 4)));
extern void ext3_warning (struct super_block *, const char *, const char *, ...)
	__attribute__ ((format (printf, 3, 4)));
extern void ext3_msg(struct super_block *, const char *, const char *, ...)
	__attribute__ ((format (printf, 3, 4)));
extern void ext3_update_dynamic_rev (struct super_block *sb);

#define ext3_std_error(sb, errno)				\
do {								\
	if ((errno))						\
		__ext3_std_error((sb), __func__, (errno));	\
} while (0)

/*
 * Inodes and files operations
 */

/* dir.c */
extern const struct file_operations ext3_dir_operations;

/* file.c */
extern const struct inode_operations ext3_file_inode_operations;
extern const struct file_operations ext3_file_operations;

/* namei.c */
extern const struct inode_operations ext3_dir_inode_operations;
extern const struct inode_operations ext3_special_inode_operations;

/* symlink.c */
extern const struct inode_operations ext3_symlink_inode_operations;
extern const struct inode_operations ext3_fast_symlink_inode_operations;


#endif	/* __KERNEL__ */

#endif	/* _LINUX_EXT3_FS_H */
*/ error = mem_cgroup_shmem_charge_fallback( swappage, current->mm, gfp); if (error) { unlock_page(swappage); page_cache_release(swappage); goto failed; } } unlock_page(swappage); page_cache_release(swappage); goto repeat; } } else if (sgp == SGP_READ && !filepage) { shmem_swp_unmap(entry); filepage = find_get_page(mapping, idx); if (filepage && (!PageUptodate(filepage) || !trylock_page(filepage))) { spin_unlock(&info->lock); wait_on_page_locked(filepage); page_cache_release(filepage); filepage = NULL; goto repeat; } spin_unlock(&info->lock); } else { shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { spin_lock(&sbinfo->stat_lock); if (sbinfo->free_blocks == 0 || shmem_acct_block(info->flags)) { spin_unlock(&sbinfo->stat_lock); spin_unlock(&info->lock); error = -ENOSPC; goto failed; } sbinfo->free_blocks--; inode->i_blocks += BLOCKS_PER_PAGE; spin_unlock(&sbinfo->stat_lock); } else if (shmem_acct_block(info->flags)) { spin_unlock(&info->lock); error = -ENOSPC; goto failed; } if (!filepage) { int ret; spin_unlock(&info->lock); filepage = shmem_alloc_page(gfp, info, idx); if (!filepage) { shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); error = -ENOMEM; goto failed; } SetPageSwapBacked(filepage); /* Precharge page while we can wait, compensate after */ error = mem_cgroup_cache_charge(filepage, current->mm, GFP_KERNEL); if (error) { page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); filepage = NULL; goto failed; } spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) error = PTR_ERR(entry); else { swap = *entry; shmem_swp_unmap(entry); } ret = error || swap.val; if (ret) mem_cgroup_uncharge_cache_page(filepage); else ret = add_to_page_cache_lru(filepage, mapping, idx, GFP_NOWAIT); /* * At add_to_page_cache_lru() failure, uncharge will * be done automatically. */ if (ret) { spin_unlock(&info->lock); page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); filepage = NULL; if (error) goto failed; goto repeat; } info->flags |= SHMEM_PAGEIN; } info->alloced++; spin_unlock(&info->lock); clear_highpage(filepage); flush_dcache_page(filepage); SetPageUptodate(filepage); if (sgp == SGP_DIRTY) set_page_dirty(filepage); } done: *pagep = filepage; return 0; failed: if (*pagep != filepage) { unlock_page(filepage); page_cache_release(filepage); } return error; } static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; int error; int ret; if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? 
VM_FAULT_OOM : VM_FAULT_SIGBUS); return ret | VM_FAULT_LOCKED; } #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) { struct inode *i = vma->vm_file->f_path.dentry->d_inode; return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) { struct inode *i = vma->vm_file->f_path.dentry->d_inode; unsigned long idx; idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); } #endif int shmem_lock(struct file *file, int lock, struct user_struct *user) { struct inode *inode = file->f_path.dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; spin_lock(&info->lock); if (lock && !(info->flags & VM_LOCKED)) { if (!user_shm_lock(inode->i_size, user)) goto out_nomem; info->flags |= VM_LOCKED; mapping_set_unevictable(file->f_mapping); } if (!lock && (info->flags & VM_LOCKED) && user) { user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); scan_mapping_unevictable_pages(file->f_mapping); } retval = 0; out_nomem: spin_unlock(&info->lock); return retval; } static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } static struct inode *shmem_get_inode(struct super_block *sb, int mode, dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (shmem_reserve_inode(sb)) return NULL; inode = new_inode(sb); if (inode) { inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); cache_no_acl(inode); switch (mode & S_IFMT) { default: inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo)); break; case S_IFDIR: inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; inode->i_fop = &simple_dir_operations; break; case S_IFLNK: /* * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. */ mpol_shared_policy_init(&info->policy, NULL); break; } } else shmem_free_inode(sb); return inode; } #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_symlink_inline_operations; /* * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; * but providing them allows a tmpfs file to be used for splice, sendfile, and * below the loop driver, in the generic fashion that many filesystems support. 
*/ static int shmem_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); unlock_page(page); return error; } static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; pgoff_t index = pos >> PAGE_CACHE_SHIFT; *pagep = NULL; return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } static int shmem_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); unlock_page(page); set_page_dirty(page); page_cache_release(page); return copied; } static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) { struct inode *inode = filp->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; unsigned long index, offset; enum sgp_type sgp = SGP_READ; /* * Might this read be for a stacking filesystem? Then when reading * holes of a sparse file, we actually need to allocate those pages, * and even mark them dirty, so it cannot exceed the max_blocks limit. */ if (segment_eq(get_fs(), KERNEL_DS)) sgp = SGP_DIRTY; index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; for (;;) { struct page *page = NULL; unsigned long end_index, nr, ret; loff_t i_size = i_size_read(inode); end_index = i_size >> PAGE_CACHE_SHIFT; if (index > end_index) break; if (index == end_index) { nr = i_size & ~PAGE_CACHE_MASK; if (nr <= offset) break; } desc->error = shmem_getpage(inode, index, &page, sgp, NULL); if (desc->error) { if (desc->error == -EINVAL) desc->error = 0; break; } if (page) unlock_page(page); /* * We must evaluate after, since reads (unlike writes) * are called without i_mutex protection against truncate */ nr = PAGE_CACHE_SIZE; i_size = i_size_read(inode); end_index = i_size >> PAGE_CACHE_SHIFT; if (index == end_index) { nr = i_size & ~PAGE_CACHE_MASK; if (nr <= offset) { if (page) page_cache_release(page); break; } } nr -= offset; if (page) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) flush_dcache_page(page); /* * Mark the page accessed if we read the beginning. */ if (!offset) mark_page_accessed(page); } else { page = ZERO_PAGE(0); page_cache_get(page); } /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... * * The actor routine returns how many bytes were actually used.. * NOTE! This may not be the same as how much of a user buffer * we filled up (we may be padding etc), so we can only update * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). 
*/ ret = actor(desc, page, offset, nr); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; page_cache_release(page); if (ret != nr || !desc->count) break; cond_resched(); } *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; file_accessed(filp); } static ssize_t shmem_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; ssize_t retval; unsigned long seg; size_t count; loff_t *ppos = &iocb->ki_pos; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; for (seg = 0; seg < nr_segs; seg++) { read_descriptor_t desc; desc.written = 0; desc.arg.buf = iov[seg].iov_base; desc.count = iov[seg].iov_len; if (desc.count == 0) continue; desc.error = 0; do_shmem_file_read(filp, ppos, &desc, file_read_actor); retval += desc.written; if (desc.error) { retval = retval ?: desc.error; break; } if (desc.count > 0) break; } return retval; } static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_namelen = NAME_MAX; spin_lock(&sbinfo->stat_lock); if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ spin_unlock(&sbinfo->stat_lock); return 0; } /* * File creation. Allocate an inode, and we're done.. */ static int shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { struct inode *inode; int error = -ENOSPC; inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, NULL, NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); return error; } } error = shmem_acl_init(inode, dir); if (error) { iput(inode); return error; } if (dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; if (S_ISDIR(mode)) inode->i_mode |= S_ISGID; } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ } return error; } static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) { int error; if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) return error; inc_nlink(dir); return 0; } static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { return shmem_mknod(dir, dentry, mode | S_IFREG, 0); } /* * Link a file.. */ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; int ret; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. 
*/ ret = shmem_reserve_inode(inode->i_sb); if (ret) goto out; dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); atomic_inc(&inode->i_count); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); out: return ret; } static int shmem_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb); dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; } static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(dentry->d_inode); drop_nlink(dir); return shmem_unlink(dir, dentry); } /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *inode = old_dentry->d_inode; int they_are_dirs = S_ISDIR(inode->i_mode); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (new_dentry->d_inode) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) drop_nlink(old_dir); } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = inode->i_ctime = CURRENT_TIME; return 0; } static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; struct page *page = NULL; char *kaddr; struct shmem_inode_info *info; len = strlen(symname) + 1; if (len > PAGE_CACHE_SIZE) return -ENAMETOOLONG; inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); if (!inode) return -ENOSPC; error = security_inode_init_security(inode, dir, NULL, NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); return error; } error = 0; } info = SHMEM_I(inode); inode->i_size = len-1; if (len <= (char *)inode - (char *)info) { /* do it inline */ memcpy(info, symname, len); inode->i_op = &shmem_symlink_inline_operations; } else { error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { iput(inode); return error; } unlock_page(page); inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); kunmap_atomic(kaddr, KM_USER0); set_page_dirty(page); page_cache_release(page); } if (dir->i_mode & S_ISGID) inode->i_gid = dir->i_gid; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); dget(dentry); return 0; } static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) { nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); return NULL; } static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) { struct page *page = NULL; int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); nd_set_link(nd, res ? 
ERR_PTR(res) : kmap(page)); if (page) unlock_page(page); return page; } static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { if (!IS_ERR(nd_get_link(nd))) { struct page *page = cookie; kunmap(page); mark_page_accessed(page); page_cache_release(page); } } static const struct inode_operations shmem_symlink_inline_operations = { .readlink = generic_readlink, .follow_link = shmem_follow_link_inline, }; static const struct inode_operations shmem_symlink_inode_operations = { .truncate = shmem_truncate, .readlink = generic_readlink, .follow_link = shmem_follow_link, .put_link = shmem_put_link, }; #ifdef CONFIG_TMPFS_POSIX_ACL /* * Superblocks without xattr inode operations will get security.* xattr * support from the VFS "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at * filesystem level, though. */ static size_t shmem_xattr_security_list(struct inode *inode, char *list, size_t list_len, const char *name, size_t name_len) { return security_inode_listsecurity(inode, list, list_len); } static int shmem_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size) { if (strcmp(name, "") == 0) return -EINVAL; return xattr_getsecurity(inode, name, buffer, size); } static int shmem_xattr_security_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) { if (strcmp(name, "") == 0) return -EINVAL; return security_inode_setsecurity(inode, name, value, size, flags); } static struct xattr_handler shmem_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = shmem_xattr_security_list, .get = shmem_xattr_security_get, .set = shmem_xattr_security_set, }; static struct xattr_handler *shmem_xattr_handlers[] = { &shmem_xattr_acl_access_handler, &shmem_xattr_acl_default_handler, &shmem_xattr_security_handler, NULL }; #endif static struct dentry *shmem_get_parent(struct dentry *child) { return ERR_PTR(-ESTALE); } static int shmem_match(struct inode *ino, void *vfh) { __u32 *fh = vfh; __u64 inum = fh[2]; inum = (inum << 32) | fh[1]; return ino->i_ino == inum && fh[0] == ino->i_generation; } static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct inode *inode; struct dentry *dentry = NULL; u64 inum = fid->raw[2]; inum = (inum << 32) | fid->raw[1]; if (fh_len < 3) return NULL; inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { dentry = d_find_alias(inode); iput(inode); } return dentry; } static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, int connectable) { struct inode *inode = dentry->d_inode; if (*len < 3) return 255; if (hlist_unhashed(&inode->i_hash)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try * to do it once */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); if (hlist_unhashed(&inode->i_hash)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); } fh[0] = inode->i_generation; fh[1] = inode->i_ino; fh[2] = ((__u64)inode->i_ino) >> 32; *len = 3; return 1; } static const struct export_operations shmem_export_ops = { .get_parent = shmem_get_parent, .encode_fh = shmem_encode_fh, .fh_to_dentry = shmem_fh_to_dentry, }; static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, bool remount) { char *this_char, *value, *rest; while (options != NULL) { this_char = options; for (;;) { 
/* * NUL-terminate this option: unfortunately, * mount options form a comma-separated list, * but mpol's nodelist may also contain commas. */ options = strchr(options, ','); if (options == NULL) break; options++; if (!isdigit(*options)) { options[-1] = '\0'; break; } } if (!*this_char) continue; if ((value = strchr(this_char,'=')) != NULL) { *value++ = 0; } else { printk(KERN_ERR "tmpfs: No value for mount option '%s'\n", this_char); return 1; } if (!strcmp(this_char,"size")) { unsigned long long size; size = memparse(value,&rest); if (*rest == '%') { size <<= PAGE_SHIFT; size *= totalram_pages; do_div(size, 100); rest++; } if (*rest) goto bad_val; sbinfo->max_blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE); } else if (!strcmp(this_char,"nr_blocks")) { sbinfo->max_blocks = memparse(value, &rest); if (*rest) goto bad_val; } else if (!strcmp(this_char,"nr_inodes")) { sbinfo->max_inodes = memparse(value, &rest); if (*rest) goto bad_val; } else if (!strcmp(this_char,"mode")) { if (remount) continue; sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; if (*rest) goto bad_val; } else if (!strcmp(this_char,"uid")) { if (remount) continue; sbinfo->uid = simple_strtoul(value, &rest, 0); if (*rest) goto bad_val; } else if (!strcmp(this_char,"gid")) { if (remount) continue; sbinfo->gid = simple_strtoul(value, &rest, 0); if (*rest) goto bad_val; } else if (!strcmp(this_char,"mpol")) { if (mpol_parse_str(value, &sbinfo->mpol, 1)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", this_char); return 1; } } return 0; bad_val: printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", value, this_char); return 1; } static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct shmem_sb_info config = *sbinfo; unsigned long blocks; unsigned long inodes; int error = -EINVAL; if (shmem_parse_options(data, &config, true)) return error; spin_lock(&sbinfo->stat_lock); blocks = sbinfo->max_blocks - sbinfo->free_blocks; inodes = sbinfo->max_inodes - sbinfo->free_inodes; if (config.max_blocks < blocks) goto out; if (config.max_inodes < inodes) goto out; /* * Those tests also disallow limited->unlimited while any are in * use, so i_blocks will always be zero when max_blocks is zero; * but we must separately disallow unlimited->limited, because * in that case we have no record of how much is already in use. 
*/ if (config.max_blocks && !sbinfo->max_blocks) goto out; if (config.max_inodes && !sbinfo->max_inodes) goto out; error = 0; sbinfo->max_blocks = config.max_blocks; sbinfo->free_blocks = config.max_blocks - blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; mpol_put(sbinfo->mpol); sbinfo->mpol = config.mpol; /* transfers initial ref */ out: spin_unlock(&sbinfo->stat_lock); return error; } static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) { struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb); if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) seq_printf(seq, ",mode=%03o", sbinfo->mode); if (sbinfo->uid != 0) seq_printf(seq, ",uid=%u", sbinfo->uid); if (sbinfo->gid != 0) seq_printf(seq, ",gid=%u", sbinfo->gid); shmem_show_mpol(seq, sbinfo->mpol); return 0; } #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) { kfree(sb->s_fs_info); sb->s_fs_info = NULL; } static int shmem_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct dentry *root; struct shmem_sb_info *sbinfo; int err = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return -ENOMEM; sbinfo->max_blocks = 0; sbinfo->max_inodes = 0; sbinfo->mode = S_IRWXUGO | S_ISVTX; sbinfo->uid = current_fsuid(); sbinfo->gid = current_fsgid(); sbinfo->mpol = NULL; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS /* * Per default we only allow half of the physical ram per * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. 
*/ if (!(sb->s_flags & MS_NOUSER)) { sbinfo->max_blocks = shmem_default_max_blocks(); sbinfo->max_inodes = shmem_default_max_inodes(); if (shmem_parse_options(data, sbinfo, false)) { err = -EINVAL; goto failed; } } sb->s_export_op = &shmem_export_ops; #else sb->s_flags |= MS_NOUSER; #endif spin_lock_init(&sbinfo->stat_lock); sbinfo->free_blocks = sbinfo->max_blocks; sbinfo->free_inodes = sbinfo->max_inodes; sb->s_maxbytes = SHMEM_MAX_BYTES; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_xattr = shmem_xattr_handlers; sb->s_flags |= MS_POSIXACL; #endif inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (!inode) goto failed; inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; root = d_alloc_root(inode); if (!root) goto failed_iput; sb->s_root = root; return 0; failed_iput: iput(inode); failed: shmem_put_super(sb); return err; } static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *p; p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); if (!p) return NULL; return &p->vfs_inode; } static void shmem_destroy_inode(struct inode *inode) { if ((inode->i_mode & S_IFMT) == S_IFREG) { /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } static void init_once(void *foo) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; inode_init_once(&p->vfs_inode); } static int init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC, init_once); return 0; } static void destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS .readpage = shmem_readpage, .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif .migratepage = migrate_page, }; static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, #endif }; static const struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, .setattr = shmem_notify_change, .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_POSIX_ACL .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, .permission = shmem_permission, #endif }; static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, .unlink = shmem_unlink, .symlink = shmem_symlink, .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_notify_change, .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, .permission = shmem_permission, #endif }; static const struct inode_operations shmem_special_inode_operations = { #ifdef 
CONFIG_TMPFS_POSIX_ACL .setattr = shmem_notify_change, .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, .permission = shmem_permission, #endif }; static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, .remount_fs = shmem_remount_fs, .show_options = shmem_show_options, #endif .delete_inode = shmem_delete_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, }; static struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif }; static int shmem_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); } static struct file_system_type tmpfs_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .get_sb = shmem_get_sb, .kill_sb = kill_litter_super, }; static int __init init_tmpfs(void) { int error; error = bdi_init(&shmem_backing_dev_info); if (error) goto out4; error = init_inodecache(); if (error) goto out3; error = register_filesystem(&tmpfs_fs_type); if (error) { printk(KERN_ERR "Could not register tmpfs\n"); goto out2; } shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, tmpfs_fs_type.name, NULL); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); printk(KERN_ERR "Could not kern_mount tmpfs\n"); goto out1; } return 0; out1: unregister_filesystem(&tmpfs_fs_type); out2: destroy_inodecache(); out3: bdi_destroy(&shmem_backing_dev_info); out4: shm_mnt = ERR_PTR(error); return error; } #else /* !CONFIG_SHMEM */ /* * tiny-shmem: simple shmemfs and tmpfs using ramfs code * * This is intended for small system where the benefits of the full * shmem code (swap-backed and resource-limited) are outweighed by * their complexity. On systems without swap this code should be * effectively equivalent, but much lighter weight. 
*/ #include <linux/ramfs.h> static struct file_system_type tmpfs_fs_type = { .name = "tmpfs", .get_sb = ramfs_get_sb, .kill_sb = kill_litter_super, }; static int __init init_tmpfs(void) { BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); shm_mnt = kern_mount(&tmpfs_fs_type); BUG_ON(IS_ERR(shm_mnt)); return 0; } int shmem_unuse(swp_entry_t entry, struct page *page) { return 0; } #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE #endif /* CONFIG_SHMEM */ /* common code */ /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { int error; struct file *file; struct inode *inode; struct dentry *dentry, *root; struct qstr this; if (IS_ERR(shm_mnt)) return (void *)shm_mnt; if (size < 0 || size > SHMEM_MAX_BYTES) return ERR_PTR(-EINVAL); if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); error = -ENOMEM; this.name = name; this.len = strlen(name); this.hash = 0; /* will go */ root = shm_mnt->mnt_root; dentry = d_alloc(root, &this); if (!dentry) goto put_memory; error = -ENFILE; file = get_empty_filp(); if (!file) goto put_dentry; error = -ENOSPC; inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) goto close_file; d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, &shmem_file_operations); #ifndef CONFIG_MMU error = ramfs_nommu_expand_for_mapping(inode, size); if (error) goto close_file; #endif ima_counts_get(file); return file; close_file: put_filp(file); put_dentry: dput(dentry); put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap_pgoff */ int shmem_zero_setup(struct vm_area_struct *vma) { struct file *file; loff_t size = vma->vm_end - vma->vm_start; file = shmem_file_setup("dev/zero", size, vma->vm_flags); if (IS_ERR(file)) return PTR_ERR(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; return 0; } module_init(init_tmpfs)