diff options
-rw-r--r-- | fs/xfs/Makefile-linux-2.6 | 2 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_globals.c | 1 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_linux.h | 1 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sysctl.c | 11 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sysctl.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_ag.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_bmap.c | 69 | ||||
-rw-r--r-- | fs/xfs/xfs_clnt.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_dinode.h | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_filestream.c | 771 | ||||
-rw-r--r-- | fs/xfs/xfs_filestream.h | 136 | ||||
-rw-r--r-- | fs/xfs/xfs_fs.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_fsops.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 17 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.h | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_mru_cache.c | 608 | ||||
-rw-r--r-- | fs/xfs/xfs_mru_cache.h | 57 | ||||
-rw-r--r-- | fs/xfs/xfs_vfsops.c | 26 | ||||
-rw-r--r-- | fs/xfs/xfs_vnodeops.c | 25 |
21 files changed, 1730 insertions, 12 deletions
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6 index b49989bb89ad..e7a9a83f0087 100644 --- a/fs/xfs/Makefile-linux-2.6 +++ b/fs/xfs/Makefile-linux-2.6 | |||
@@ -64,6 +64,7 @@ xfs-y += xfs_alloc.o \ | |||
64 | xfs_dir2_sf.o \ | 64 | xfs_dir2_sf.o \ |
65 | xfs_error.o \ | 65 | xfs_error.o \ |
66 | xfs_extfree_item.o \ | 66 | xfs_extfree_item.o \ |
67 | xfs_filestream.o \ | ||
67 | xfs_fsops.o \ | 68 | xfs_fsops.o \ |
68 | xfs_ialloc.o \ | 69 | xfs_ialloc.o \ |
69 | xfs_ialloc_btree.o \ | 70 | xfs_ialloc_btree.o \ |
@@ -77,6 +78,7 @@ xfs-y += xfs_alloc.o \ | |||
77 | xfs_log.o \ | 78 | xfs_log.o \ |
78 | xfs_log_recover.o \ | 79 | xfs_log_recover.o \ |
79 | xfs_mount.o \ | 80 | xfs_mount.o \ |
81 | xfs_mru_cache.o \ | ||
80 | xfs_rename.o \ | 82 | xfs_rename.o \ |
81 | xfs_trans.o \ | 83 | xfs_trans.o \ |
82 | xfs_trans_ail.o \ | 84 | xfs_trans_ail.o \ |
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c index ed3a5e1b4b67..bb72c3d4141f 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/linux-2.6/xfs_globals.c | |||
@@ -46,6 +46,7 @@ xfs_param_t xfs_params = { | |||
46 | .inherit_nosym = { 0, 0, 1 }, | 46 | .inherit_nosym = { 0, 0, 1 }, |
47 | .rotorstep = { 1, 1, 255 }, | 47 | .rotorstep = { 1, 1, 255 }, |
48 | .inherit_nodfrg = { 0, 1, 1 }, | 48 | .inherit_nodfrg = { 0, 1, 1 }, |
49 | .fstrm_timer = { 1, 50, 3600*100}, | ||
49 | }; | 50 | }; |
50 | 51 | ||
51 | /* | 52 | /* |
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index af24a457d3a3..330c4ba9d404 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h | |||
@@ -123,6 +123,7 @@ | |||
123 | #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val | 123 | #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val |
124 | #define xfs_rotorstep xfs_params.rotorstep.val | 124 | #define xfs_rotorstep xfs_params.rotorstep.val |
125 | #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val | 125 | #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val |
126 | #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val | ||
126 | 127 | ||
127 | #define current_cpu() (raw_smp_processor_id()) | 128 | #define current_cpu() (raw_smp_processor_id()) |
128 | #define current_pid() (current->pid) | 129 | #define current_pid() (current->pid) |
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index cd6eaa44aa2b..bb997d75c05c 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c | |||
@@ -210,6 +210,17 @@ static ctl_table xfs_table[] = { | |||
210 | .extra1 = &xfs_params.inherit_nodfrg.min, | 210 | .extra1 = &xfs_params.inherit_nodfrg.min, |
211 | .extra2 = &xfs_params.inherit_nodfrg.max | 211 | .extra2 = &xfs_params.inherit_nodfrg.max |
212 | }, | 212 | }, |
213 | { | ||
214 | .ctl_name = XFS_FILESTREAM_TIMER, | ||
215 | .procname = "filestream_centisecs", | ||
216 | .data = &xfs_params.fstrm_timer.val, | ||
217 | .maxlen = sizeof(int), | ||
218 | .mode = 0644, | ||
219 | .proc_handler = &proc_dointvec_minmax, | ||
220 | .strategy = &sysctl_intvec, | ||
221 | .extra1 = &xfs_params.fstrm_timer.min, | ||
222 | .extra2 = &xfs_params.fstrm_timer.max, | ||
223 | }, | ||
213 | /* please keep this the last entry */ | 224 | /* please keep this the last entry */ |
214 | #ifdef CONFIG_PROC_FS | 225 | #ifdef CONFIG_PROC_FS |
215 | { | 226 | { |
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h index a631fb8cc5ac..98b97e399d6f 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/linux-2.6/xfs_sysctl.h | |||
@@ -47,6 +47,7 @@ typedef struct xfs_param { | |||
47 | xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ | 47 | xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ |
48 | xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ | 48 | xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ |
49 | xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ | 49 | xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ |
50 | xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ | ||
50 | } xfs_param_t; | 51 | } xfs_param_t; |
51 | 52 | ||
52 | /* | 53 | /* |
@@ -86,6 +87,7 @@ enum { | |||
86 | XFS_INHERIT_NOSYM = 19, | 87 | XFS_INHERIT_NOSYM = 19, |
87 | XFS_ROTORSTEP = 20, | 88 | XFS_ROTORSTEP = 20, |
88 | XFS_INHERIT_NODFRG = 21, | 89 | XFS_INHERIT_NODFRG = 21, |
90 | XFS_FILESTREAM_TIMER = 22, | ||
89 | }; | 91 | }; |
90 | 92 | ||
91 | extern xfs_param_t xfs_params; | 93 | extern xfs_param_t xfs_params; |
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index bf0a12040b13..b5a7d92c6843 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h | |||
@@ -38,6 +38,7 @@ | |||
38 | #define XFS_RW_TRACE 1 | 38 | #define XFS_RW_TRACE 1 |
39 | #define XFS_BUF_TRACE 1 | 39 | #define XFS_BUF_TRACE 1 |
40 | #define XFS_VNODE_TRACE 1 | 40 | #define XFS_VNODE_TRACE 1 |
41 | #define XFS_FILESTREAMS_TRACE 1 | ||
41 | #endif | 42 | #endif |
42 | 43 | ||
43 | #include <linux-2.6/xfs_linux.h> | 44 | #include <linux-2.6/xfs_linux.h> |
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index b1dd0029c60e..51c09c114a20 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h | |||
@@ -196,6 +196,7 @@ typedef struct xfs_perag | |||
196 | lock_t pagb_lock; /* lock for pagb_list */ | 196 | lock_t pagb_lock; /* lock for pagb_list */ |
197 | #endif | 197 | #endif |
198 | xfs_perag_busy_t *pagb_list; /* unstable blocks */ | 198 | xfs_perag_busy_t *pagb_list; /* unstable blocks */ |
199 | atomic_t pagf_fstrms; /* # of filestreams active in this AG */ | ||
199 | } xfs_perag_t; | 200 | } xfs_perag_t; |
200 | 201 | ||
201 | #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) | 202 | #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) |
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 09d86388bb71..51ba689a4552 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
@@ -52,6 +52,7 @@ | |||
52 | #include "xfs_quota.h" | 52 | #include "xfs_quota.h" |
53 | #include "xfs_trans_space.h" | 53 | #include "xfs_trans_space.h" |
54 | #include "xfs_buf_item.h" | 54 | #include "xfs_buf_item.h" |
55 | #include "xfs_filestream.h" | ||
55 | 56 | ||
56 | 57 | ||
57 | #ifdef DEBUG | 58 | #ifdef DEBUG |
@@ -2725,9 +2726,15 @@ xfs_bmap_btalloc( | |||
2725 | } | 2726 | } |
2726 | nullfb = ap->firstblock == NULLFSBLOCK; | 2727 | nullfb = ap->firstblock == NULLFSBLOCK; |
2727 | fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); | 2728 | fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); |
2728 | if (nullfb) | 2729 | if (nullfb) { |
2729 | ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); | 2730 | if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { |
2730 | else | 2731 | ag = xfs_filestream_lookup_ag(ap->ip); |
2732 | ag = (ag != NULLAGNUMBER) ? ag : 0; | ||
2733 | ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); | ||
2734 | } else { | ||
2735 | ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); | ||
2736 | } | ||
2737 | } else | ||
2731 | ap->rval = ap->firstblock; | 2738 | ap->rval = ap->firstblock; |
2732 | 2739 | ||
2733 | xfs_bmap_adjacent(ap); | 2740 | xfs_bmap_adjacent(ap); |
@@ -2751,13 +2758,22 @@ xfs_bmap_btalloc( | |||
2751 | args.firstblock = ap->firstblock; | 2758 | args.firstblock = ap->firstblock; |
2752 | blen = 0; | 2759 | blen = 0; |
2753 | if (nullfb) { | 2760 | if (nullfb) { |
2754 | args.type = XFS_ALLOCTYPE_START_BNO; | 2761 | if (ap->userdata && xfs_inode_is_filestream(ap->ip)) |
2762 | args.type = XFS_ALLOCTYPE_NEAR_BNO; | ||
2763 | else | ||
2764 | args.type = XFS_ALLOCTYPE_START_BNO; | ||
2755 | args.total = ap->total; | 2765 | args.total = ap->total; |
2766 | |||
2756 | /* | 2767 | /* |
2757 | * Find the longest available space. | 2768 | * Search for an allocation group with a single extent |
2758 | * We're going to try for the whole allocation at once. | 2769 | * large enough for the request. |
2770 | * | ||
2771 | * If one isn't found, then adjust the minimum allocation | ||
2772 | * size to the largest space found. | ||
2759 | */ | 2773 | */ |
2760 | startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); | 2774 | startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); |
2775 | if (startag == NULLAGNUMBER) | ||
2776 | startag = ag = 0; | ||
2761 | notinit = 0; | 2777 | notinit = 0; |
2762 | down_read(&mp->m_peraglock); | 2778 | down_read(&mp->m_peraglock); |
2763 | while (blen < ap->alen) { | 2779 | while (blen < ap->alen) { |
@@ -2783,6 +2799,35 @@ xfs_bmap_btalloc( | |||
2783 | blen = longest; | 2799 | blen = longest; |
2784 | } else | 2800 | } else |
2785 | notinit = 1; | 2801 | notinit = 1; |
2802 | |||
2803 | if (xfs_inode_is_filestream(ap->ip)) { | ||
2804 | if (blen >= ap->alen) | ||
2805 | break; | ||
2806 | |||
2807 | if (ap->userdata) { | ||
2808 | /* | ||
2809 | * If startag is an invalid AG, we've | ||
2810 | * come here once before and | ||
2811 | * xfs_filestream_new_ag picked the | ||
2812 | * best currently available. | ||
2813 | * | ||
2814 | * Don't continue looping, since we | ||
2815 | * could loop forever. | ||
2816 | */ | ||
2817 | if (startag == NULLAGNUMBER) | ||
2818 | break; | ||
2819 | |||
2820 | error = xfs_filestream_new_ag(ap, &ag); | ||
2821 | if (error) { | ||
2822 | up_read(&mp->m_peraglock); | ||
2823 | return error; | ||
2824 | } | ||
2825 | |||
2826 | /* loop again to set 'blen'*/ | ||
2827 | startag = NULLAGNUMBER; | ||
2828 | continue; | ||
2829 | } | ||
2830 | } | ||
2786 | if (++ag == mp->m_sb.sb_agcount) | 2831 | if (++ag == mp->m_sb.sb_agcount) |
2787 | ag = 0; | 2832 | ag = 0; |
2788 | if (ag == startag) | 2833 | if (ag == startag) |
@@ -2807,8 +2852,18 @@ xfs_bmap_btalloc( | |||
2807 | */ | 2852 | */ |
2808 | else | 2853 | else |
2809 | args.minlen = ap->alen; | 2854 | args.minlen = ap->alen; |
2855 | |||
2856 | /* | ||
2857 | * set the failure fallback case to look in the selected | ||
2858 | * AG as the stream may have moved. | ||
2859 | */ | ||
2860 | if (xfs_inode_is_filestream(ap->ip)) | ||
2861 | ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0); | ||
2810 | } else if (ap->low) { | 2862 | } else if (ap->low) { |
2811 | args.type = XFS_ALLOCTYPE_START_BNO; | 2863 | if (xfs_inode_is_filestream(ap->ip)) |
2864 | args.type = XFS_ALLOCTYPE_FIRST_AG; | ||
2865 | else | ||
2866 | args.type = XFS_ALLOCTYPE_START_BNO; | ||
2812 | args.total = args.minlen = ap->minlen; | 2867 | args.total = args.minlen = ap->minlen; |
2813 | } else { | 2868 | } else { |
2814 | args.type = XFS_ALLOCTYPE_NEAR_BNO; | 2869 | args.type = XFS_ALLOCTYPE_NEAR_BNO; |
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index 5b7eb81453be..f89196cb08d2 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h | |||
@@ -99,5 +99,7 @@ struct xfs_mount_args { | |||
99 | */ | 99 | */ |
100 | #define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred | 100 | #define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred |
101 | * I/O size in stat(2) */ | 101 | * I/O size in stat(2) */ |
102 | #define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams | ||
103 | * allocator */ | ||
102 | 104 | ||
103 | #endif /* __XFS_CLNT_H__ */ | 105 | #endif /* __XFS_CLNT_H__ */ |
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index b33826961c45..fefd0116bac9 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h | |||
@@ -257,6 +257,7 @@ typedef enum xfs_dinode_fmt | |||
257 | #define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ | 257 | #define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ |
258 | #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ | 258 | #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ |
259 | #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ | 259 | #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ |
260 | #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ | ||
260 | #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) | 261 | #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) |
261 | #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) | 262 | #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) |
262 | #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) | 263 | #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) |
@@ -271,12 +272,13 @@ typedef enum xfs_dinode_fmt | |||
271 | #define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) | 272 | #define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) |
272 | #define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) | 273 | #define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) |
273 | #define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) | 274 | #define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) |
275 | #define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) | ||
274 | 276 | ||
275 | #define XFS_DIFLAG_ANY \ | 277 | #define XFS_DIFLAG_ANY \ |
276 | (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ | 278 | (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ |
277 | XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ | 279 | XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ |
278 | XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ | 280 | XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ |
279 | XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ | 281 | XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ |
280 | XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG) | 282 | XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) |
281 | 283 | ||
282 | #endif /* __XFS_DINODE_H__ */ | 284 | #endif /* __XFS_DINODE_H__ */ |
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c new file mode 100644 index 000000000000..ce2278611bb7 --- /dev/null +++ b/fs/xfs/xfs_filestream.c | |||
@@ -0,0 +1,771 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006-2007 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #include "xfs.h" | ||
19 | #include "xfs_bmap_btree.h" | ||
20 | #include "xfs_inum.h" | ||
21 | #include "xfs_dir2.h" | ||
22 | #include "xfs_dir2_sf.h" | ||
23 | #include "xfs_attr_sf.h" | ||
24 | #include "xfs_dinode.h" | ||
25 | #include "xfs_inode.h" | ||
26 | #include "xfs_ag.h" | ||
27 | #include "xfs_dmapi.h" | ||
28 | #include "xfs_log.h" | ||
29 | #include "xfs_trans.h" | ||
30 | #include "xfs_sb.h" | ||
31 | #include "xfs_mount.h" | ||
32 | #include "xfs_bmap.h" | ||
33 | #include "xfs_alloc.h" | ||
34 | #include "xfs_utils.h" | ||
35 | #include "xfs_mru_cache.h" | ||
36 | #include "xfs_filestream.h" | ||
37 | |||
#ifdef XFS_FILESTREAMS_TRACE

/* Trace buffer that receives every filestreams trace record. */
ktrace_t *xfs_filestreams_trace_buf;

/*
 * Append one record to xfs_filestreams_trace_buf.
 *
 * The first slot holds the trace type OR-ed with the source line number
 * shifted left by 16 bits.  It is followed by the calling function's name,
 * the current pid, the mount pointer and the six caller-supplied arguments.
 * The remaining slots passed to ktrace_enter() are NULL.
 */
STATIC void
xfs_filestreams_trace(
	xfs_mount_t	*mp,	/* mount point */
	int		type,	/* type of trace */
	const char	*func,	/* source function */
	int		line,	/* source line number */
	__psunsigned_t	arg0,
	__psunsigned_t	arg1,
	__psunsigned_t	arg2,
	__psunsigned_t	arg3,
	__psunsigned_t	arg4,
	__psunsigned_t	arg5)
{
	ktrace_enter(xfs_filestreams_trace_buf,
		(void *)(__psint_t)(type | (line << 16)),
		(void *)func,
		(void *)(__psunsigned_t)current_pid(),
		(void *)mp,
		(void *)(__psunsigned_t)arg0,
		(void *)(__psunsigned_t)arg1,
		(void *)(__psunsigned_t)arg2,
		(void *)(__psunsigned_t)arg3,
		(void *)(__psunsigned_t)arg4,
		(void *)(__psunsigned_t)arg5,
		NULL, NULL, NULL, NULL, NULL, NULL);
}

/*
 * TRACEn() records an event with n arguments; unused argument slots are
 * zero.  Every argument is parenthesised before being cast so that an
 * expression argument (e.g. "a + b") is cast as a whole.
 */
#define TRACE0(mp,t)			TRACE6(mp,t,0,0,0,0,0,0)
#define TRACE1(mp,t,a0)			TRACE6(mp,t,a0,0,0,0,0,0)
#define TRACE2(mp,t,a0,a1)		TRACE6(mp,t,a0,a1,0,0,0,0)
#define TRACE3(mp,t,a0,a1,a2)		TRACE6(mp,t,a0,a1,a2,0,0,0)
#define TRACE4(mp,t,a0,a1,a2,a3)	TRACE6(mp,t,a0,a1,a2,a3,0,0)
#define TRACE5(mp,t,a0,a1,a2,a3,a4)	TRACE6(mp,t,a0,a1,a2,a3,a4,0)
#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
	xfs_filestreams_trace((mp), (t), __FUNCTION__, __LINE__, \
				(__psunsigned_t)(a0), (__psunsigned_t)(a1), \
				(__psunsigned_t)(a2), (__psunsigned_t)(a3), \
				(__psunsigned_t)(a4), (__psunsigned_t)(a5))

/*
 * Event-specific wrappers.  None of them ends in a semicolon: the caller
 * supplies it, so each expands to a single statement wherever it is used.
 */
#define TRACE_AG_SCAN(mp, ag, ag2) \
		TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2)
#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
		TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree)
#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
		TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
			 cnt, free, scan, flag)
#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
		TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
#define TRACE_FREE(mp, ip, pip, ag, cnt) \
		TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
		TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
		TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
		TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
#define TRACE_ORPHAN(mp, ip, ag) \
		TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag)


#else
/* Tracing compiled out: every trace point expands to nothing. */
#define TRACE_AG_SCAN(mp, ag, ag2)
#define TRACE_AG_PICK1(mp, max_ag, maxfree)
#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
#define TRACE_FREE(mp, ip, pip, ag, cnt)
#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
#define TRACE_ORPHAN(mp, ip, ag)
#endif
113 | |||
114 | static kmem_zone_t *item_zone; | ||
115 | |||
116 | /* | ||
117 | * Structure for associating a file or a directory with an allocation group. | ||
118 | * The parent directory pointer is only needed for files, but since there will | ||
119 | * generally be vastly more files than directories in the cache, using the same | ||
120 | * data structure simplifies the code with very little memory overhead. | ||
121 | */ | ||
122 | typedef struct fstrm_item | ||
123 | { | ||
124 | xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ | ||
125 | xfs_inode_t *ip; /* inode self-pointer. */ | ||
126 | xfs_inode_t *pip; /* Parent directory inode pointer. */ | ||
127 | } fstrm_item_t; | ||
128 | |||
129 | |||
130 | /* | ||
131 | * Scan the AGs starting at startag looking for an AG that isn't in use and has | ||
132 | * at least minlen blocks free. | ||
133 | */ | ||
134 | static int | ||
135 | _xfs_filestream_pick_ag( | ||
136 | xfs_mount_t *mp, | ||
137 | xfs_agnumber_t startag, | ||
138 | xfs_agnumber_t *agp, | ||
139 | int flags, | ||
140 | xfs_extlen_t minlen) | ||
141 | { | ||
142 | int err, trylock, nscan; | ||
143 | xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0; | ||
144 | xfs_agnumber_t ag, max_ag = NULLAGNUMBER; | ||
145 | struct xfs_perag *pag; | ||
146 | |||
147 | /* 2% of an AG's blocks must be free for it to be chosen. */ | ||
148 | minfree = mp->m_sb.sb_agblocks / 50; | ||
149 | |||
150 | ag = startag; | ||
151 | *agp = NULLAGNUMBER; | ||
152 | |||
153 | /* For the first pass, don't sleep trying to init the per-AG. */ | ||
154 | trylock = XFS_ALLOC_FLAG_TRYLOCK; | ||
155 | |||
156 | for (nscan = 0; 1; nscan++) { | ||
157 | |||
158 | TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag)); | ||
159 | |||
160 | pag = mp->m_perag + ag; | ||
161 | |||
162 | if (!pag->pagf_init) { | ||
163 | err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); | ||
164 | if (err && !trylock) | ||
165 | return err; | ||
166 | } | ||
167 | |||
168 | /* Might fail sometimes during the 1st pass with trylock set. */ | ||
169 | if (!pag->pagf_init) | ||
170 | goto next_ag; | ||
171 | |||
172 | /* Keep track of the AG with the most free blocks. */ | ||
173 | if (pag->pagf_freeblks > maxfree) { | ||
174 | maxfree = pag->pagf_freeblks; | ||
175 | max_ag = ag; | ||
176 | } | ||
177 | |||
178 | /* | ||
179 | * The AG reference count does two things: it enforces mutual | ||
180 | * exclusion when examining the suitability of an AG in this | ||
181 | * loop, and it guards against two filestreams being established | ||
182 | * in the same AG as each other. | ||
183 | */ | ||
184 | if (xfs_filestream_get_ag(mp, ag) > 1) { | ||
185 | xfs_filestream_put_ag(mp, ag); | ||
186 | goto next_ag; | ||
187 | } | ||
188 | |||
189 | need = XFS_MIN_FREELIST_PAG(pag, mp); | ||
190 | delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; | ||
191 | longest = (pag->pagf_longest > delta) ? | ||
192 | (pag->pagf_longest - delta) : | ||
193 | (pag->pagf_flcount > 0 || pag->pagf_longest > 0); | ||
194 | |||
195 | if (((minlen && longest >= minlen) || | ||
196 | (!minlen && pag->pagf_freeblks >= minfree)) && | ||
197 | (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || | ||
198 | (flags & XFS_PICK_LOWSPACE))) { | ||
199 | |||
200 | /* Break out, retaining the reference on the AG. */ | ||
201 | free = pag->pagf_freeblks; | ||
202 | *agp = ag; | ||
203 | break; | ||
204 | } | ||
205 | |||
206 | /* Drop the reference on this AG, it's not usable. */ | ||
207 | xfs_filestream_put_ag(mp, ag); | ||
208 | next_ag: | ||
209 | /* Move to the next AG, wrapping to AG 0 if necessary. */ | ||
210 | if (++ag >= mp->m_sb.sb_agcount) | ||
211 | ag = 0; | ||
212 | |||
213 | /* If a full pass of the AGs hasn't been done yet, continue. */ | ||
214 | if (ag != startag) | ||
215 | continue; | ||
216 | |||
217 | /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ | ||
218 | if (trylock != 0) { | ||
219 | trylock = 0; | ||
220 | continue; | ||
221 | } | ||
222 | |||
223 | /* Finally, if lowspace wasn't set, set it for the 3rd pass. */ | ||
224 | if (!(flags & XFS_PICK_LOWSPACE)) { | ||
225 | flags |= XFS_PICK_LOWSPACE; | ||
226 | continue; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * Take the AG with the most free space, regardless of whether | ||
231 | * it's already in use by another filestream. | ||
232 | */ | ||
233 | if (max_ag != NULLAGNUMBER) { | ||
234 | xfs_filestream_get_ag(mp, max_ag); | ||
235 | TRACE_AG_PICK1(mp, max_ag, maxfree); | ||
236 | free = maxfree; | ||
237 | *agp = max_ag; | ||
238 | break; | ||
239 | } | ||
240 | |||
241 | /* take AG 0 if none matched */ | ||
242 | TRACE_AG_PICK1(mp, max_ag, maxfree); | ||
243 | *agp = 0; | ||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp), | ||
248 | free, nscan, flags); | ||
249 | |||
250 | return 0; | ||
251 | } | ||
252 | |||
253 | /* | ||
254 | * Set the allocation group number for a file or a directory, updating inode | ||
255 | * references and per-AG references as appropriate. Must be called with the | ||
256 | * m_peraglock held in read mode. | ||
257 | */ | ||
258 | static int | ||
259 | _xfs_filestream_update_ag( | ||
260 | xfs_inode_t *ip, | ||
261 | xfs_inode_t *pip, | ||
262 | xfs_agnumber_t ag) | ||
263 | { | ||
264 | int err = 0; | ||
265 | xfs_mount_t *mp; | ||
266 | xfs_mru_cache_t *cache; | ||
267 | fstrm_item_t *item; | ||
268 | xfs_agnumber_t old_ag; | ||
269 | xfs_inode_t *old_pip; | ||
270 | |||
271 | /* | ||
272 | * Either ip is a regular file and pip is a directory, or ip is a | ||
273 | * directory and pip is NULL. | ||
274 | */ | ||
275 | ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && | ||
276 | (pip->i_d.di_mode & S_IFDIR)) || | ||
277 | ((ip->i_d.di_mode & S_IFDIR) && !pip))); | ||
278 | |||
279 | mp = ip->i_mount; | ||
280 | cache = mp->m_filestream; | ||
281 | |||
282 | item = xfs_mru_cache_lookup(cache, ip->i_ino); | ||
283 | if (item) { | ||
284 | ASSERT(item->ip == ip); | ||
285 | old_ag = item->ag; | ||
286 | item->ag = ag; | ||
287 | old_pip = item->pip; | ||
288 | item->pip = pip; | ||
289 | xfs_mru_cache_done(cache); | ||
290 | |||
291 | /* | ||
292 | * If the AG has changed, drop the old ref and take a new one, | ||
293 | * effectively transferring the reference from old to new AG. | ||
294 | */ | ||
295 | if (ag != old_ag) { | ||
296 | xfs_filestream_put_ag(mp, old_ag); | ||
297 | xfs_filestream_get_ag(mp, ag); | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * If ip is a file and its pip has changed, drop the old ref and | ||
302 | * take a new one. | ||
303 | */ | ||
304 | if (pip && pip != old_pip) { | ||
305 | IRELE(old_pip); | ||
306 | IHOLD(pip); | ||
307 | } | ||
308 | |||
309 | TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), | ||
310 | ag, xfs_filestream_peek_ag(mp, ag)); | ||
311 | return 0; | ||
312 | } | ||
313 | |||
314 | item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); | ||
315 | if (!item) | ||
316 | return ENOMEM; | ||
317 | |||
318 | item->ag = ag; | ||
319 | item->ip = ip; | ||
320 | item->pip = pip; | ||
321 | |||
322 | err = xfs_mru_cache_insert(cache, ip->i_ino, item); | ||
323 | if (err) { | ||
324 | kmem_zone_free(item_zone, item); | ||
325 | return err; | ||
326 | } | ||
327 | |||
328 | /* Take a reference on the AG. */ | ||
329 | xfs_filestream_get_ag(mp, ag); | ||
330 | |||
331 | /* | ||
332 | * Take a reference on the inode itself regardless of whether it's a | ||
333 | * regular file or a directory. | ||
334 | */ | ||
335 | IHOLD(ip); | ||
336 | |||
337 | /* | ||
338 | * In the case of a regular file, take a reference on the parent inode | ||
339 | * as well to ensure it remains in-core. | ||
340 | */ | ||
341 | if (pip) | ||
342 | IHOLD(pip); | ||
343 | |||
344 | TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), | ||
345 | ag, xfs_filestream_peek_ag(mp, ag)); | ||
346 | |||
347 | return 0; | ||
348 | } | ||
349 | |||
350 | /* xfs_fstrm_free_func(): callback for freeing cached stream items. */ | ||
351 | void | ||
352 | xfs_fstrm_free_func( | ||
353 | xfs_ino_t ino, | ||
354 | fstrm_item_t *item) | ||
355 | { | ||
356 | xfs_inode_t *ip = item->ip; | ||
357 | int ref; | ||
358 | |||
359 | ASSERT(ip->i_ino == ino); | ||
360 | |||
361 | xfs_iflags_clear(ip, XFS_IFILESTREAM); | ||
362 | |||
363 | /* Drop the reference taken on the AG when the item was added. */ | ||
364 | ref = xfs_filestream_put_ag(ip->i_mount, item->ag); | ||
365 | |||
366 | ASSERT(ref >= 0); | ||
367 | TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, | ||
368 | xfs_filestream_peek_ag(ip->i_mount, item->ag)); | ||
369 | |||
370 | /* | ||
371 | * _xfs_filestream_update_ag() always takes a reference on the inode | ||
372 | * itself, whether it's a file or a directory. Release it here. | ||
373 | * This can result in the inode being freed and so we must | ||
374 | * not hold any inode locks when freeing filesstreams objects | ||
375 | * otherwise we can deadlock here. | ||
376 | */ | ||
377 | IRELE(ip); | ||
378 | |||
379 | /* | ||
380 | * In the case of a regular file, _xfs_filestream_update_ag() also | ||
381 | * takes a ref on the parent inode to keep it in-core. Release that | ||
382 | * too. | ||
383 | */ | ||
384 | if (item->pip) | ||
385 | IRELE(item->pip); | ||
386 | |||
387 | /* Finally, free the memory allocated for the item. */ | ||
388 | kmem_zone_free(item_zone, item); | ||
389 | } | ||
390 | |||
391 | /* | ||
392 | * xfs_filestream_init() is called at xfs initialisation time to set up the | ||
393 | * memory zone that will be used for filestream data structure allocation. | ||
394 | */ | ||
395 | int | ||
396 | xfs_filestream_init(void) | ||
397 | { | ||
398 | item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); | ||
399 | #ifdef XFS_FILESTREAMS_TRACE | ||
400 | xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP); | ||
401 | #endif | ||
402 | return item_zone ? 0 : -ENOMEM; | ||
403 | } | ||
404 | |||
405 | /* | ||
406 | * xfs_filestream_uninit() is called at xfs termination time to destroy the | ||
407 | * memory zone that was used for filestream data structure allocation. | ||
408 | */ | ||
409 | void | ||
410 | xfs_filestream_uninit(void) | ||
411 | { | ||
412 | #ifdef XFS_FILESTREAMS_TRACE | ||
413 | ktrace_free(xfs_filestreams_trace_buf); | ||
414 | #endif | ||
415 | kmem_zone_destroy(item_zone); | ||
416 | } | ||
417 | |||
418 | /* | ||
419 | * xfs_filestream_mount() is called when a file system is mounted with the | ||
420 | * filestream option. It is responsible for allocating the data structures | ||
421 | * needed to track the new file system's file streams. | ||
422 | */ | ||
423 | int | ||
424 | xfs_filestream_mount( | ||
425 | xfs_mount_t *mp) | ||
426 | { | ||
427 | int err; | ||
428 | unsigned int lifetime, grp_count; | ||
429 | |||
430 | /* | ||
431 | * The filestream timer tunable is currently fixed within the range of | ||
432 | * one second to four minutes, with five seconds being the default. The | ||
433 | * group count is somewhat arbitrary, but it'd be nice to adhere to the | ||
434 | * timer tunable to within about 10 percent. This requires at least 10 | ||
435 | * groups. | ||
436 | */ | ||
437 | lifetime = xfs_fstrm_centisecs * 10; | ||
438 | grp_count = 10; | ||
439 | |||
440 | err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, | ||
441 | (xfs_mru_cache_free_func_t)xfs_fstrm_free_func); | ||
442 | |||
443 | return err; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * xfs_filestream_unmount() is called when a file system that was mounted with | ||
448 | * the filestream option is unmounted. It drains the data structures created | ||
449 | * to track the file system's file streams and frees all the memory that was | ||
450 | * allocated. | ||
451 | */ | ||
452 | void | ||
453 | xfs_filestream_unmount( | ||
454 | xfs_mount_t *mp) | ||
455 | { | ||
456 | xfs_mru_cache_destroy(mp->m_filestream); | ||
457 | } | ||
458 | |||
459 | /* | ||
460 | * If the mount point's m_perag array is going to be reallocated, all | ||
461 | * outstanding cache entries must be flushed to avoid accessing reference count | ||
462 | * addresses that have been freed. The call to xfs_filestream_flush() must be | ||
463 | * made inside the block that holds the m_peraglock in write mode to do the | ||
464 | * reallocation. | ||
465 | */ | ||
466 | void | ||
467 | xfs_filestream_flush( | ||
468 | xfs_mount_t *mp) | ||
469 | { | ||
470 | /* point in time flush, so keep the reaper running */ | ||
471 | xfs_mru_cache_flush(mp->m_filestream, 1); | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * Return the AG of the filestream the file or directory belongs to, or | ||
476 | * NULLAGNUMBER otherwise. | ||
477 | */ | ||
478 | xfs_agnumber_t | ||
479 | xfs_filestream_lookup_ag( | ||
480 | xfs_inode_t *ip) | ||
481 | { | ||
482 | xfs_mru_cache_t *cache; | ||
483 | fstrm_item_t *item; | ||
484 | xfs_agnumber_t ag; | ||
485 | int ref; | ||
486 | |||
487 | if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { | ||
488 | ASSERT(0); | ||
489 | return NULLAGNUMBER; | ||
490 | } | ||
491 | |||
492 | cache = ip->i_mount->m_filestream; | ||
493 | item = xfs_mru_cache_lookup(cache, ip->i_ino); | ||
494 | if (!item) { | ||
495 | TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); | ||
496 | return NULLAGNUMBER; | ||
497 | } | ||
498 | |||
499 | ASSERT(ip == item->ip); | ||
500 | ag = item->ag; | ||
501 | ref = xfs_filestream_peek_ag(ip->i_mount, ag); | ||
502 | xfs_mru_cache_done(cache); | ||
503 | |||
504 | TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); | ||
505 | return ag; | ||
506 | } | ||
507 | |||
508 | /* | ||
509 | * xfs_filestream_associate() should only be called to associate a regular file | ||
510 | * with its parent directory. Calling it with a child directory isn't | ||
511 | * appropriate because filestreams don't apply to entire directory hierarchies. | ||
512 | * Creating a file in a child directory of an existing filestream directory | ||
513 | * starts a new filestream with its own allocation group association. | ||
514 | * | ||
515 | * Returns < 0 on error, 0 if successful association occurred, > 0 if | ||
516 | * we failed to get an association because of locking issues. | ||
517 | */ | ||
518 | int | ||
519 | xfs_filestream_associate( | ||
520 | xfs_inode_t *pip, | ||
521 | xfs_inode_t *ip) | ||
522 | { | ||
523 | xfs_mount_t *mp; | ||
524 | xfs_mru_cache_t *cache; | ||
525 | fstrm_item_t *item; | ||
526 | xfs_agnumber_t ag, rotorstep, startag; | ||
527 | int err = 0; | ||
528 | |||
529 | ASSERT(pip->i_d.di_mode & S_IFDIR); | ||
530 | ASSERT(ip->i_d.di_mode & S_IFREG); | ||
531 | if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) | ||
532 | return -EINVAL; | ||
533 | |||
534 | mp = pip->i_mount; | ||
535 | cache = mp->m_filestream; | ||
536 | down_read(&mp->m_peraglock); | ||
537 | |||
538 | /* | ||
539 | * We have a problem, Houston. | ||
540 | * | ||
541 | * Taking the iolock here violates inode locking order - we already | ||
542 | * hold the ilock. Hence if we block getting this lock we may never | ||
543 | * wake. Unfortunately, that means if we can't get the lock, we're | ||
544 | * screwed in terms of getting a stream association - we can't spin | ||
545 | * waiting for the lock because someone else is waiting on the lock we | ||
546 | * hold and we cannot drop that as we are in a transaction here. | ||
547 | * | ||
548 | * Lucky for us, this inversion is rarely a problem because it's a | ||
549 | * directory inode that we are trying to lock here and that means the | ||
550 | * only place that matters is xfs_sync_inodes() and SYNC_DELWRI is | ||
551 | * used. i.e. freeze, remount-ro, quotasync or unmount. | ||
552 | * | ||
553 | * So, if we can't get the iolock without sleeping then just give up | ||
554 | */ | ||
555 | if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { | ||
556 | up_read(&mp->m_peraglock); | ||
557 | return 1; | ||
558 | } | ||
559 | |||
560 | /* If the parent directory is already in the cache, use its AG. */ | ||
561 | item = xfs_mru_cache_lookup(cache, pip->i_ino); | ||
562 | if (item) { | ||
563 | ASSERT(item->ip == pip); | ||
564 | ag = item->ag; | ||
565 | xfs_mru_cache_done(cache); | ||
566 | |||
567 | TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); | ||
568 | err = _xfs_filestream_update_ag(ip, pip, ag); | ||
569 | |||
570 | goto exit; | ||
571 | } | ||
572 | |||
573 | /* | ||
574 | * Set the starting AG using the rotor for inode32, otherwise | ||
575 | * use the directory inode's AG. | ||
576 | */ | ||
577 | if (mp->m_flags & XFS_MOUNT_32BITINODES) { | ||
578 | rotorstep = xfs_rotorstep; | ||
579 | startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; | ||
580 | mp->m_agfrotor = (mp->m_agfrotor + 1) % | ||
581 | (mp->m_sb.sb_agcount * rotorstep); | ||
582 | } else | ||
583 | startag = XFS_INO_TO_AGNO(mp, pip->i_ino); | ||
584 | |||
585 | /* Pick a new AG for the parent inode starting at startag. */ | ||
586 | err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); | ||
587 | if (err || ag == NULLAGNUMBER) | ||
588 | goto exit_did_pick; | ||
589 | |||
590 | /* Associate the parent inode with the AG. */ | ||
591 | err = _xfs_filestream_update_ag(pip, NULL, ag); | ||
592 | if (err) | ||
593 | goto exit_did_pick; | ||
594 | |||
595 | /* Associate the file inode with the AG. */ | ||
596 | err = _xfs_filestream_update_ag(ip, pip, ag); | ||
597 | if (err) | ||
598 | goto exit_did_pick; | ||
599 | |||
600 | TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); | ||
601 | |||
602 | exit_did_pick: | ||
603 | /* | ||
604 | * If _xfs_filestream_pick_ag() returned a valid AG, remove the | ||
605 | * reference it took on it, since the file and directory will have taken | ||
606 | * their own now if they were successfully cached. | ||
607 | */ | ||
608 | if (ag != NULLAGNUMBER) | ||
609 | xfs_filestream_put_ag(mp, ag); | ||
610 | |||
611 | exit: | ||
612 | xfs_iunlock(pip, XFS_IOLOCK_EXCL); | ||
613 | up_read(&mp->m_peraglock); | ||
614 | return -err; | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * Pick a new allocation group for the current file and its file stream. This | ||
619 | * function is called by xfs_bmap_filestreams() with the mount point's per-ag | ||
620 | * lock held. | ||
621 | */ | ||
622 | int | ||
623 | xfs_filestream_new_ag( | ||
624 | xfs_bmalloca_t *ap, | ||
625 | xfs_agnumber_t *agp) | ||
626 | { | ||
627 | int flags, err; | ||
628 | xfs_inode_t *ip, *pip = NULL; | ||
629 | xfs_mount_t *mp; | ||
630 | xfs_mru_cache_t *cache; | ||
631 | xfs_extlen_t minlen; | ||
632 | fstrm_item_t *dir, *file; | ||
633 | xfs_agnumber_t ag = NULLAGNUMBER; | ||
634 | |||
635 | ip = ap->ip; | ||
636 | mp = ip->i_mount; | ||
637 | cache = mp->m_filestream; | ||
638 | minlen = ap->alen; | ||
639 | *agp = NULLAGNUMBER; | ||
640 | |||
641 | /* | ||
642 | * Look for the file in the cache, removing it if it's found. Doing | ||
643 | * this allows it to be held across the dir lookup that follows. | ||
644 | */ | ||
645 | file = xfs_mru_cache_remove(cache, ip->i_ino); | ||
646 | if (file) { | ||
647 | ASSERT(ip == file->ip); | ||
648 | |||
649 | /* Save the file's parent inode and old AG number for later. */ | ||
650 | pip = file->pip; | ||
651 | ag = file->ag; | ||
652 | |||
653 | /* Look for the file's directory in the cache. */ | ||
654 | dir = xfs_mru_cache_lookup(cache, pip->i_ino); | ||
655 | if (dir) { | ||
656 | ASSERT(pip == dir->ip); | ||
657 | |||
658 | /* | ||
659 | * If the directory has already moved on to a new AG, | ||
660 | * use that AG as the new AG for the file. Don't | ||
661 | * forget to twiddle the AG refcounts to match the | ||
662 | * movement. | ||
663 | */ | ||
664 | if (dir->ag != file->ag) { | ||
665 | xfs_filestream_put_ag(mp, file->ag); | ||
666 | xfs_filestream_get_ag(mp, dir->ag); | ||
667 | *agp = file->ag = dir->ag; | ||
668 | } | ||
669 | |||
670 | xfs_mru_cache_done(cache); | ||
671 | } | ||
672 | |||
673 | /* | ||
674 | * Put the file back in the cache. If this fails, the free | ||
675 | * function needs to be called to tidy up in the same way as if | ||
676 | * the item had simply expired from the cache. | ||
677 | */ | ||
678 | err = xfs_mru_cache_insert(cache, ip->i_ino, file); | ||
679 | if (err) { | ||
680 | xfs_fstrm_free_func(ip->i_ino, file); | ||
681 | return err; | ||
682 | } | ||
683 | |||
684 | /* | ||
685 | * If the file's AG was moved to the directory's new AG, there's | ||
686 | * nothing more to be done. | ||
687 | */ | ||
688 | if (*agp != NULLAGNUMBER) { | ||
689 | TRACE_MOVEAG(mp, ip, pip, | ||
690 | ag, xfs_filestream_peek_ag(mp, ag), | ||
691 | *agp, xfs_filestream_peek_ag(mp, *agp)); | ||
692 | return 0; | ||
693 | } | ||
694 | } | ||
695 | |||
696 | /* | ||
697 | * If the file's parent directory is known, take its iolock in exclusive | ||
698 | * mode to prevent two sibling files from racing each other to migrate | ||
699 | * themselves and their parent to different AGs. | ||
700 | */ | ||
701 | if (pip) | ||
702 | xfs_ilock(pip, XFS_IOLOCK_EXCL); | ||
703 | |||
704 | /* | ||
705 | * A new AG needs to be found for the file. If the file's parent | ||
706 | * directory is also known, it will be moved to the new AG as well to | ||
707 | * ensure that files created inside it in future use the new AG. | ||
708 | */ | ||
709 | ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; | ||
710 | flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | | ||
711 | (ap->low ? XFS_PICK_LOWSPACE : 0); | ||
712 | |||
713 | err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); | ||
714 | if (err || *agp == NULLAGNUMBER) | ||
715 | goto exit; | ||
716 | |||
717 | /* | ||
718 | * If the file wasn't found in the file cache, then its parent directory | ||
719 | * inode isn't known. For this to have happened, the file must either | ||
720 | * be pre-existing, or it was created long enough ago that its cache | ||
721 | * entry has expired. This isn't the sort of usage that the filestreams | ||
722 | * allocator is trying to optimise, so there's no point trying to track | ||
723 | * its new AG somehow in the filestream data structures. | ||
724 | */ | ||
725 | if (!pip) { | ||
726 | TRACE_ORPHAN(mp, ip, *agp); | ||
727 | goto exit; | ||
728 | } | ||
729 | |||
730 | /* Associate the parent inode with the AG. */ | ||
731 | err = _xfs_filestream_update_ag(pip, NULL, *agp); | ||
732 | if (err) | ||
733 | goto exit; | ||
734 | |||
735 | /* Associate the file inode with the AG. */ | ||
736 | err = _xfs_filestream_update_ag(ip, pip, *agp); | ||
737 | if (err) | ||
738 | goto exit; | ||
739 | |||
740 | TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, | ||
741 | *agp, xfs_filestream_peek_ag(mp, *agp)); | ||
742 | |||
743 | exit: | ||
744 | /* | ||
745 | * If _xfs_filestream_pick_ag() returned a valid AG, remove the | ||
746 | * reference it took on it, since the file and directory will have taken | ||
747 | * their own now if they were successfully cached. | ||
748 | */ | ||
749 | if (*agp != NULLAGNUMBER) | ||
750 | xfs_filestream_put_ag(mp, *agp); | ||
751 | else | ||
752 | *agp = 0; | ||
753 | |||
754 | if (pip) | ||
755 | xfs_iunlock(pip, XFS_IOLOCK_EXCL); | ||
756 | |||
757 | return err; | ||
758 | } | ||
759 | |||
760 | /* | ||
761 | * Remove an association between an inode and a filestream object. | ||
762 | * Typically this is done on last close of an unlinked file. | ||
763 | */ | ||
764 | void | ||
765 | xfs_filestream_deassociate( | ||
766 | xfs_inode_t *ip) | ||
767 | { | ||
768 | xfs_mru_cache_t *cache = ip->i_mount->m_filestream; | ||
769 | |||
770 | xfs_mru_cache_delete(cache, ip->i_ino); | ||
771 | } | ||
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h new file mode 100644 index 000000000000..f655f7dc334c --- /dev/null +++ b/fs/xfs/xfs_filestream.h | |||
@@ -0,0 +1,136 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006-2007 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #ifndef __XFS_FILESTREAM_H__ | ||
19 | #define __XFS_FILESTREAM_H__ | ||
20 | |||
21 | #ifdef __KERNEL__ | ||
22 | |||
23 | struct xfs_mount; | ||
24 | struct xfs_inode; | ||
25 | struct xfs_perag; | ||
26 | struct xfs_bmalloca; | ||
27 | |||
28 | #ifdef XFS_FILESTREAMS_TRACE | ||
29 | #define XFS_FSTRM_KTRACE_INFO 1 | ||
30 | #define XFS_FSTRM_KTRACE_AGSCAN 2 | ||
31 | #define XFS_FSTRM_KTRACE_AGPICK1 3 | ||
32 | #define XFS_FSTRM_KTRACE_AGPICK2 4 | ||
33 | #define XFS_FSTRM_KTRACE_UPDATE 5 | ||
34 | #define XFS_FSTRM_KTRACE_FREE 6 | ||
35 | #define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7 | ||
36 | #define XFS_FSTRM_KTRACE_ASSOCIATE 8 | ||
37 | #define XFS_FSTRM_KTRACE_MOVEAG 9 | ||
38 | #define XFS_FSTRM_KTRACE_ORPHAN 10 | ||
39 | |||
40 | #define XFS_FSTRM_KTRACE_SIZE 16384 | ||
41 | extern ktrace_t *xfs_filestreams_trace_buf; | ||
42 | |||
43 | #endif | ||
44 | |||
45 | /* | ||
46 | * Allocation group filestream associations are tracked with per-ag atomic | ||
47 | * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a | ||
48 | * particular AG already has active filestreams associated with it. The mount | ||
49 | * point's m_peraglock is used to protect these counters from per-ag array | ||
50 | * re-allocation during a growfs operation. When xfs_growfs_data_private() is | ||
51 | * about to reallocate the array, it calls xfs_filestream_flush() with the | ||
52 | * m_peraglock held in write mode. | ||
53 | * | ||
54 | * Since xfs_mru_cache_flush() guarantees that all the free functions for all | ||
55 | * the cache elements have finished executing before it returns, it's safe for | ||
56 | * the free functions to use the atomic counters without m_peraglock protection. | ||
57 | * This allows the implementation of xfs_fstrm_free_func() to be agnostic about | ||
58 | * whether it was called with the m_peraglock held in read mode, write mode or | ||
59 | * not held at all. The race condition this addresses is the following: | ||
60 | * | ||
61 | * - The work queue scheduler fires and pulls a filestream directory cache | ||
62 | * element off the LRU end of the cache for deletion, then gets pre-empted. | ||
63 | * - A growfs operation grabs the m_peraglock in write mode, flushes all the | ||
64 | * remaining items from the cache and reallocates the mount point's per-ag | ||
65 | * array, resetting all the counters to zero. | ||
66 | * - The work queue thread resumes and calls the free function for the element | ||
67 | * it started cleaning up earlier. In the process it decrements the | ||
68 | * filestreams counter for an AG that now has no references. | ||
69 | * | ||
70 | * With a shrinkfs feature, the above scenario could panic the system. | ||
71 | * | ||
72 | * All other uses of the following macros should be protected by either the | ||
73 | * m_peraglock held in read mode, or the cache's internal locking exposed by the | ||
74 | * interval between a call to xfs_mru_cache_lookup() and a call to | ||
75 | * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode | ||
76 | * when new elements are added to the cache. | ||
77 | * | ||
78 | * Combined, these locking rules ensure that no associations will ever exist in | ||
79 | * the cache that reference per-ag array elements that have since been | ||
80 | * reallocated. | ||
81 | */ | ||
82 | STATIC_INLINE int | ||
83 | xfs_filestream_peek_ag( | ||
84 | xfs_mount_t *mp, | ||
85 | xfs_agnumber_t agno) | ||
86 | { | ||
87 | return atomic_read(&mp->m_perag[agno].pagf_fstrms); | ||
88 | } | ||
89 | |||
90 | STATIC_INLINE int | ||
91 | xfs_filestream_get_ag( | ||
92 | xfs_mount_t *mp, | ||
93 | xfs_agnumber_t agno) | ||
94 | { | ||
95 | return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); | ||
96 | } | ||
97 | |||
98 | STATIC_INLINE int | ||
99 | xfs_filestream_put_ag( | ||
100 | xfs_mount_t *mp, | ||
101 | xfs_agnumber_t agno) | ||
102 | { | ||
103 | return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); | ||
104 | } | ||
105 | |||
/*
 * Allocation selection flags.  These are distinct bits that are OR-ed
 * together to form the flags argument of _xfs_filestream_pick_ag().
 */
typedef enum xfs_fstrm_alloc {
	XFS_PICK_USERDATA	= 1 << 0,	/* allocation is for user data */
	XFS_PICK_LOWSPACE	= 1 << 1,	/* allocator is in low space mode */
} xfs_fstrm_alloc_t;
111 | |||
112 | /* prototypes for filestream.c */ | ||
113 | int xfs_filestream_init(void); | ||
114 | void xfs_filestream_uninit(void); | ||
115 | int xfs_filestream_mount(struct xfs_mount *mp); | ||
116 | void xfs_filestream_unmount(struct xfs_mount *mp); | ||
117 | void xfs_filestream_flush(struct xfs_mount *mp); | ||
118 | xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); | ||
119 | int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); | ||
120 | void xfs_filestream_deassociate(struct xfs_inode *ip); | ||
121 | int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); | ||
122 | |||
123 | |||
124 | /* filestreams for the inode? */ | ||
125 | STATIC_INLINE int | ||
126 | xfs_inode_is_filestream( | ||
127 | struct xfs_inode *ip) | ||
128 | { | ||
129 | return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || | ||
130 | xfs_iflags_test(ip, XFS_IFILESTREAM) || | ||
131 | (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); | ||
132 | } | ||
133 | |||
134 | #endif /* __KERNEL__ */ | ||
135 | |||
136 | #endif /* __XFS_FILESTREAM_H__ */ | ||
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 1b60cfc28be5..ec3c9c27e0de 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h | |||
@@ -66,6 +66,7 @@ struct fsxattr { | |||
66 | #define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ | 66 | #define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ |
67 | #define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ | 67 | #define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ |
68 | #define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ | 68 | #define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ |
69 | #define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ | ||
69 | #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ | 70 | #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ |
70 | 71 | ||
71 | /* | 72 | /* |
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2251a49f3e17..432e82347ed6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include "xfs_trans_space.h" | 44 | #include "xfs_trans_space.h" |
45 | #include "xfs_rtalloc.h" | 45 | #include "xfs_rtalloc.h" |
46 | #include "xfs_rw.h" | 46 | #include "xfs_rw.h" |
47 | #include "xfs_filestream.h" | ||
47 | 48 | ||
48 | /* | 49 | /* |
49 | * File system operations | 50 | * File system operations |
@@ -165,6 +166,7 @@ xfs_growfs_data_private( | |||
165 | new = nb - mp->m_sb.sb_dblocks; | 166 | new = nb - mp->m_sb.sb_dblocks; |
166 | oagcount = mp->m_sb.sb_agcount; | 167 | oagcount = mp->m_sb.sb_agcount; |
167 | if (nagcount > oagcount) { | 168 | if (nagcount > oagcount) { |
169 | xfs_filestream_flush(mp); | ||
168 | down_write(&mp->m_peraglock); | 170 | down_write(&mp->m_peraglock); |
169 | mp->m_perag = kmem_realloc(mp->m_perag, | 171 | mp->m_perag = kmem_realloc(mp->m_perag, |
170 | sizeof(xfs_perag_t) * nagcount, | 172 | sizeof(xfs_perag_t) * nagcount, |
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8fdd30d9ba56..2ef100be6c4f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include "xfs_dir2_trace.h" | 48 | #include "xfs_dir2_trace.h" |
49 | #include "xfs_quota.h" | 49 | #include "xfs_quota.h" |
50 | #include "xfs_acl.h" | 50 | #include "xfs_acl.h" |
51 | #include "xfs_filestream.h" | ||
51 | 52 | ||
52 | #include <linux/log2.h> | 53 | #include <linux/log2.h> |
53 | 54 | ||
@@ -818,6 +819,8 @@ _xfs_dic2xflags( | |||
818 | flags |= XFS_XFLAG_EXTSZINHERIT; | 819 | flags |= XFS_XFLAG_EXTSZINHERIT; |
819 | if (di_flags & XFS_DIFLAG_NODEFRAG) | 820 | if (di_flags & XFS_DIFLAG_NODEFRAG) |
820 | flags |= XFS_XFLAG_NODEFRAG; | 821 | flags |= XFS_XFLAG_NODEFRAG; |
822 | if (di_flags & XFS_DIFLAG_FILESTREAM) | ||
823 | flags |= XFS_XFLAG_FILESTREAM; | ||
821 | } | 824 | } |
822 | 825 | ||
823 | return flags; | 826 | return flags; |
@@ -1151,7 +1154,7 @@ xfs_ialloc( | |||
1151 | /* | 1154 | /* |
1152 | * Project ids won't be stored on disk if we are using a version 1 inode. | 1155 | * Project ids won't be stored on disk if we are using a version 1 inode. |
1153 | */ | 1156 | */ |
1154 | if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) | 1157 | if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) |
1155 | xfs_bump_ino_vers2(tp, ip); | 1158 | xfs_bump_ino_vers2(tp, ip); |
1156 | 1159 | ||
1157 | if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { | 1160 | if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { |
@@ -1196,8 +1199,16 @@ xfs_ialloc( | |||
1196 | flags |= XFS_ILOG_DEV; | 1199 | flags |= XFS_ILOG_DEV; |
1197 | break; | 1200 | break; |
1198 | case S_IFREG: | 1201 | case S_IFREG: |
1202 | if (xfs_inode_is_filestream(pip)) { | ||
1203 | error = xfs_filestream_associate(pip, ip); | ||
1204 | if (error < 0) | ||
1205 | return -error; | ||
1206 | if (!error) | ||
1207 | xfs_iflags_set(ip, XFS_IFILESTREAM); | ||
1208 | } | ||
1209 | /* fall through */ | ||
1199 | case S_IFDIR: | 1210 | case S_IFDIR: |
1200 | if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) { | 1211 | if (pip->i_d.di_flags & XFS_DIFLAG_ANY) { |
1201 | uint di_flags = 0; | 1212 | uint di_flags = 0; |
1202 | 1213 | ||
1203 | if ((mode & S_IFMT) == S_IFDIR) { | 1214 | if ((mode & S_IFMT) == S_IFDIR) { |
@@ -1234,6 +1245,8 @@ xfs_ialloc( | |||
1234 | if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && | 1245 | if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && |
1235 | xfs_inherit_nodefrag) | 1246 | xfs_inherit_nodefrag) |
1236 | di_flags |= XFS_DIFLAG_NODEFRAG; | 1247 | di_flags |= XFS_DIFLAG_NODEFRAG; |
1248 | if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) | ||
1249 | di_flags |= XFS_DIFLAG_FILESTREAM; | ||
1237 | ip->i_d.di_flags |= di_flags; | 1250 | ip->i_d.di_flags |= di_flags; |
1238 | } | 1251 | } |
1239 | /* FALLTHROUGH */ | 1252 | /* FALLTHROUGH */ |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f75afecef8e7..d418eeed4ebd 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -379,6 +379,7 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) | |||
379 | #define XFS_ISTALE 0x0010 /* inode has been staled */ | 379 | #define XFS_ISTALE 0x0010 /* inode has been staled */ |
380 | #define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ | 380 | #define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ |
381 | #define XFS_INEW 0x0040 | 381 | #define XFS_INEW 0x0040 |
382 | #define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */ | ||
382 | 383 | ||
383 | /* | 384 | /* |
384 | * Flags for inode locking. | 385 | * Flags for inode locking. |
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 0bca2d422719..76ad74758696 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h | |||
@@ -66,6 +66,7 @@ struct xfs_bmbt_irec; | |||
66 | struct xfs_bmap_free; | 66 | struct xfs_bmap_free; |
67 | struct xfs_extdelta; | 67 | struct xfs_extdelta; |
68 | struct xfs_swapext; | 68 | struct xfs_swapext; |
69 | struct xfs_mru_cache; | ||
69 | 70 | ||
70 | extern struct bhv_vfsops xfs_vfsops; | 71 | extern struct bhv_vfsops xfs_vfsops; |
71 | extern struct bhv_vnodeops xfs_vnodeops; | 72 | extern struct bhv_vnodeops xfs_vnodeops; |
@@ -424,6 +425,7 @@ typedef struct xfs_mount { | |||
424 | struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ | 425 | struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ |
425 | struct mutex m_icsb_mutex; /* balancer sync lock */ | 426 | struct mutex m_icsb_mutex; /* balancer sync lock */ |
426 | #endif | 427 | #endif |
428 | struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ | ||
427 | } xfs_mount_t; | 429 | } xfs_mount_t; |
428 | 430 | ||
429 | /* | 431 | /* |
@@ -463,6 +465,8 @@ typedef struct xfs_mount { | |||
463 | * I/O size in stat() */ | 465 | * I/O size in stat() */ |
464 | #define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock | 466 | #define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock |
465 | counters */ | 467 | counters */ |
468 | #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams | ||
469 | allocator */ | ||
466 | 470 | ||
467 | 471 | ||
468 | /* | 472 | /* |
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c new file mode 100644 index 000000000000..7deb9e3cbbd3 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.c | |||
@@ -0,0 +1,608 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006-2007 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #include "xfs.h" | ||
19 | #include "xfs_mru_cache.h" | ||
20 | |||
21 | /* | ||
22 | * The MRU Cache data structure consists of a data store, an array of lists and | ||
23 | * a lock to protect its internal state. At initialisation time, the client | ||
24 | * supplies an element lifetime in milliseconds and a group count, as well as a | ||
25 | * function pointer to call when deleting elements. A data structure for | ||
26 | * queueing up work in the form of timed callbacks is also included. | ||
27 | * | ||
28 | * The group count controls how many lists are created, and thereby how finely | ||
29 | * the elements are grouped in time. When reaping occurs, all the elements in | ||
30 | * all the lists whose time has expired are deleted. | ||
31 | * | ||
32 | * To give an example of how this works in practice, consider a client that | ||
33 | * initialises an MRU Cache with a lifetime of ten seconds and a group count of | ||
34 | * five. Five internal lists will be created, each representing a two second | ||
35 | * period in time. When the first element is added, time zero for the data | ||
36 | * structure is initialised to the current time. | ||
37 | * | ||
38 | * All the elements added in the first two seconds are appended to the first | ||
39 | * list. Elements added in the third second go into the second list, and so on. | ||
40 | * If an element is accessed at any point, it is removed from its list and | ||
41 | * inserted at the head of the current most-recently-used list. | ||
42 | * | ||
43 | * The reaper function will have nothing to do until at least twelve seconds | ||
44 | * have elapsed since the first element was added. The reason for this is that | ||
45 | * if it were called at t=11s, there could be elements in the first list that | ||
46 | * have only been inactive for nine seconds, so it still does nothing. If it is | ||
47 | * called anywhere between t=12 and t=14 seconds, it will delete all the | ||
48 | * elements that remain in the first list. It's therefore possible for elements | ||
49 | * to remain in the data store even after they've been inactive for up to | ||
50 | * (t + t/g) seconds, where t is the inactive element lifetime and g is the | ||
51 | * number of groups. | ||
52 | * | ||
53 | * The above example assumes that the reaper function gets called at least once | ||
54 | * every (t/g) seconds. If it is called less frequently, unused elements will | ||
55 | * accumulate in the reap list until the reaper function is eventually called. | ||
56 | * The current implementation uses work queue callbacks to carefully time the | ||
57 | * reaper function calls, so this should happen rarely, if at all. | ||
58 | * | ||
59 | * From a design perspective, the primary reason for the choice of a list array | ||
60 | * representing discrete time intervals is that it's only practical to reap | ||
61 | * expired elements in groups of some appreciable size. This automatically | ||
62 | * introduces a granularity to element lifetimes, so there's no point storing an | ||
63 | * individual timeout with each element that specifies a more precise reap time. | ||
64 | * The bonus is a saving of sizeof(long) bytes of memory per element stored. | ||
65 | * | ||
66 | * The elements could have been stored in just one list, but an array of | ||
67 | * counters or pointers would need to be maintained to allow them to be divided | ||
68 | * up into discrete time groups. More critically, the process of touching or | ||
69 | * removing an element would involve walking large portions of the entire list, | ||
70 | * which would have a detrimental effect on performance. The additional memory | ||
71 | * requirement for the array of list heads is minimal. | ||
72 | * | ||
73 | * When an element is touched or deleted, it needs to be removed from its | ||
74 | * current list. Doubly linked lists are used to make the list maintenance | ||
75 | * portion of these operations O(1). Since reaper timing can be imprecise, | ||
76 | * inserts and lookups can occur when there are no free lists available. When | ||
77 | * this happens, all the elements on the LRU list need to be migrated to the end | ||
78 | * of the reap list. To keep the list maintenance portion of these operations | ||
79 | * O(1) also, list tails need to be accessible without walking the entire list. | ||
80 | * This is the reason why doubly linked list heads are used. | ||
81 | */ | ||
82 | |||
83 | /* | ||
84 | * An MRU Cache is a dynamic data structure that stores its elements in a way | ||
85 | * that allows efficient lookups, but also groups them into discrete time | ||
86 | * intervals based on insertion time. This allows elements to be efficiently | ||
87 | * and automatically reaped after a fixed period of inactivity. | ||
88 | * | ||
89 | * When a client data pointer is stored in the MRU Cache it needs to be added to | ||
90 | * both the data store and to one of the lists. It must also be possible to | ||
91 | * access each of these entries via the other, i.e. to: | ||
92 | * | ||
93 | * a) Walk a list, removing the corresponding data store entry for each item. | ||
94 | * b) Look up a data store entry, then access its list entry directly. | ||
95 | * | ||
96 | * To achieve both of these goals, each entry must contain both a list entry and | ||
97 | * a key, in addition to the user's data pointer. Note that it's not a good | ||
98 | * idea to have the client embed one of these structures at the top of their own | ||
99 | * data structure, because inserting the same item more than once would most | ||
100 | * likely result in a loop in one of the lists. That's a sure-fire recipe for | ||
101 | * an infinite loop in the code. | ||
102 | */ | ||
103 | typedef struct xfs_mru_cache_elem | ||
104 | { | ||
105 | struct list_head list_node; | ||
106 | unsigned long key; | ||
107 | void *value; | ||
108 | } xfs_mru_cache_elem_t; | ||
109 | |||
110 | static kmem_zone_t *xfs_mru_elem_zone; | ||
111 | static struct workqueue_struct *xfs_mru_reap_wq; | ||
112 | |||
113 | /* | ||
114 | * When inserting, destroying or reaping, it's first necessary to update the | ||
115 | * lists relative to a particular time. In the case of destroying, that time | ||
116 | * will be well in the future to ensure that all items are moved to the reap | ||
117 | * list. In all other cases though, the time will be the current time. | ||
118 | * | ||
119 | * This function enters a loop, moving the contents of the LRU list to the reap | ||
120 | * list again and again until either a) the lists are all empty, or b) time zero | ||
121 | * has been advanced sufficiently to be within the immediate element lifetime. | ||
122 | * | ||
123 | * Case a) above is detected by counting how many groups are migrated and | ||
124 | * stopping when they've all been moved. Case b) is detected by monitoring the | ||
125 | * time_zero field, which is updated as each group is migrated. | ||
126 | * | ||
127 | * The return value is the earliest time that more migration could be needed, or | ||
128 | * zero if there's no need to schedule more work because the lists are empty. | ||
129 | */ | ||
130 | STATIC unsigned long | ||
131 | _xfs_mru_cache_migrate( | ||
132 | xfs_mru_cache_t *mru, | ||
133 | unsigned long now) | ||
134 | { | ||
135 | unsigned int grp; | ||
136 | unsigned int migrated = 0; | ||
137 | struct list_head *lru_list; | ||
138 | |||
139 | /* Nothing to do if the data store is empty. */ | ||
140 | if (!mru->time_zero) | ||
141 | return 0; | ||
142 | |||
143 | /* While time zero is older than the time spanned by all the lists. */ | ||
144 | while (mru->time_zero <= now - mru->grp_count * mru->grp_time) { | ||
145 | |||
146 | /* | ||
147 | * If the LRU list isn't empty, migrate its elements to the tail | ||
148 | * of the reap list. | ||
149 | */ | ||
150 | lru_list = mru->lists + mru->lru_grp; | ||
151 | if (!list_empty(lru_list)) | ||
152 | list_splice_init(lru_list, mru->reap_list.prev); | ||
153 | |||
154 | /* | ||
155 | * Advance the LRU group number, freeing the old LRU list to | ||
156 | * become the new MRU list; advance time zero accordingly. | ||
157 | */ | ||
158 | mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count; | ||
159 | mru->time_zero += mru->grp_time; | ||
160 | |||
161 | /* | ||
162 | * If reaping is so far behind that all the elements on all the | ||
163 | * lists have been migrated to the reap list, it's now empty. | ||
164 | */ | ||
165 | if (++migrated == mru->grp_count) { | ||
166 | mru->lru_grp = 0; | ||
167 | mru->time_zero = 0; | ||
168 | return 0; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | /* Find the first non-empty list from the LRU end. */ | ||
173 | for (grp = 0; grp < mru->grp_count; grp++) { | ||
174 | |||
175 | /* Check the grp'th list from the LRU end. */ | ||
176 | lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count); | ||
177 | if (!list_empty(lru_list)) | ||
178 | return mru->time_zero + | ||
179 | (mru->grp_count + grp) * mru->grp_time; | ||
180 | } | ||
181 | |||
182 | /* All the lists must be empty. */ | ||
183 | mru->lru_grp = 0; | ||
184 | mru->time_zero = 0; | ||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * When inserting or doing a lookup, an element needs to be inserted into the | ||
190 | * MRU list. The lists must be migrated first to ensure that they're | ||
191 | * up-to-date, otherwise the new element could be given a shorter lifetime in | ||
192 | * the cache than it should. | ||
193 | */ | ||
194 | STATIC void | ||
195 | _xfs_mru_cache_list_insert( | ||
196 | xfs_mru_cache_t *mru, | ||
197 | xfs_mru_cache_elem_t *elem) | ||
198 | { | ||
199 | unsigned int grp = 0; | ||
200 | unsigned long now = jiffies; | ||
201 | |||
202 | /* | ||
203 | * If the data store is empty, initialise time zero, leave grp set to | ||
204 | * zero and start the work queue timer if necessary. Otherwise, set grp | ||
205 | * to the number of group times that have elapsed since time zero. | ||
206 | */ | ||
207 | if (!_xfs_mru_cache_migrate(mru, now)) { | ||
208 | mru->time_zero = now; | ||
209 | if (!mru->next_reap) | ||
210 | mru->next_reap = mru->grp_count * mru->grp_time; | ||
211 | } else { | ||
212 | grp = (now - mru->time_zero) / mru->grp_time; | ||
213 | grp = (mru->lru_grp + grp) % mru->grp_count; | ||
214 | } | ||
215 | |||
216 | /* Insert the element at the tail of the corresponding list. */ | ||
217 | list_add_tail(&elem->list_node, mru->lists + grp); | ||
218 | } | ||
219 | |||
220 | /* | ||
221 | * When destroying or reaping, all the elements that were migrated to the reap | ||
222 | * list need to be deleted. For each element this involves removing it from the | ||
223 | * data store, removing it from the reap list, calling the client's free | ||
224 | * function and deleting the element from the element zone. | ||
225 | */ | ||
226 | STATIC void | ||
227 | _xfs_mru_cache_clear_reap_list( | ||
228 | xfs_mru_cache_t *mru) | ||
229 | { | ||
230 | xfs_mru_cache_elem_t *elem, *next; | ||
231 | struct list_head tmp; | ||
232 | |||
233 | INIT_LIST_HEAD(&tmp); | ||
234 | list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) { | ||
235 | |||
236 | /* Remove the element from the data store. */ | ||
237 | radix_tree_delete(&mru->store, elem->key); | ||
238 | |||
239 | /* | ||
240 | * move to temp list so it can be freed without | ||
241 | * needing to hold the lock | ||
242 | */ | ||
243 | list_move(&elem->list_node, &tmp); | ||
244 | } | ||
245 | mutex_spinunlock(&mru->lock, 0); | ||
246 | |||
247 | list_for_each_entry_safe(elem, next, &tmp, list_node) { | ||
248 | |||
249 | /* Remove the element from the reap list. */ | ||
250 | list_del_init(&elem->list_node); | ||
251 | |||
252 | /* Call the client's free function with the key and value pointer. */ | ||
253 | mru->free_func(elem->key, elem->value); | ||
254 | |||
255 | /* Free the element structure. */ | ||
256 | kmem_zone_free(xfs_mru_elem_zone, elem); | ||
257 | } | ||
258 | |||
259 | mutex_spinlock(&mru->lock); | ||
260 | } | ||
261 | |||
262 | /* | ||
263 | * We fire the reap timer every group expiry interval so | ||
264 | * we always have a reaper ready to run. This makes shutdown | ||
265 | * and flushing of the reaper easy to do. Hence we need to | ||
266 | * keep when the next reap must occur so we can determine | ||
267 | * at each interval whether there is anything we need to do. | ||
268 | */ | ||
269 | STATIC void | ||
270 | _xfs_mru_cache_reap( | ||
271 | struct work_struct *work) | ||
272 | { | ||
273 | xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work); | ||
274 | unsigned long now; | ||
275 | |||
276 | ASSERT(mru && mru->lists); | ||
277 | if (!mru || !mru->lists) | ||
278 | return; | ||
279 | |||
280 | mutex_spinlock(&mru->lock); | ||
281 | now = jiffies; | ||
282 | if (mru->reap_all || | ||
283 | (mru->next_reap && time_after(now, mru->next_reap))) { | ||
284 | if (mru->reap_all) | ||
285 | now += mru->grp_count * mru->grp_time * 2; | ||
286 | mru->next_reap = _xfs_mru_cache_migrate(mru, now); | ||
287 | _xfs_mru_cache_clear_reap_list(mru); | ||
288 | } | ||
289 | |||
290 | /* | ||
291 | * the process that triggered the reap_all is responsible | ||
292 | * for restarting the periodic reap if it is required. | ||
293 | */ | ||
294 | if (!mru->reap_all) | ||
295 | queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time); | ||
296 | mru->reap_all = 0; | ||
297 | mutex_spinunlock(&mru->lock, 0); | ||
298 | } | ||
299 | |||
300 | int | ||
301 | xfs_mru_cache_init(void) | ||
302 | { | ||
303 | xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), | ||
304 | "xfs_mru_cache_elem"); | ||
305 | if (!xfs_mru_elem_zone) | ||
306 | return ENOMEM; | ||
307 | |||
308 | xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); | ||
309 | if (!xfs_mru_reap_wq) { | ||
310 | kmem_zone_destroy(xfs_mru_elem_zone); | ||
311 | return ENOMEM; | ||
312 | } | ||
313 | |||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | void | ||
318 | xfs_mru_cache_uninit(void) | ||
319 | { | ||
320 | destroy_workqueue(xfs_mru_reap_wq); | ||
321 | kmem_zone_destroy(xfs_mru_elem_zone); | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create() | ||
326 | * with the address of the pointer, a lifetime value in milliseconds, a group | ||
327 | * count and a free function to use when deleting elements. This function | ||
328 | * returns 0 if the initialisation was successful. | ||
329 | */ | ||
330 | int | ||
331 | xfs_mru_cache_create( | ||
332 | xfs_mru_cache_t **mrup, | ||
333 | unsigned int lifetime_ms, | ||
334 | unsigned int grp_count, | ||
335 | xfs_mru_cache_free_func_t free_func) | ||
336 | { | ||
337 | xfs_mru_cache_t *mru = NULL; | ||
338 | int err = 0, grp; | ||
339 | unsigned int grp_time; | ||
340 | |||
341 | if (mrup) | ||
342 | *mrup = NULL; | ||
343 | |||
344 | if (!mrup || !grp_count || !lifetime_ms || !free_func) | ||
345 | return EINVAL; | ||
346 | |||
347 | if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) | ||
348 | return EINVAL; | ||
349 | |||
350 | if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) | ||
351 | return ENOMEM; | ||
352 | |||
353 | /* An extra list is needed to avoid reaping up to a grp_time early. */ | ||
354 | mru->grp_count = grp_count + 1; | ||
355 | mru->lists = kmem_alloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); | ||
356 | |||
357 | if (!mru->lists) { | ||
358 | err = ENOMEM; | ||
359 | goto exit; | ||
360 | } | ||
361 | |||
362 | for (grp = 0; grp < mru->grp_count; grp++) | ||
363 | INIT_LIST_HEAD(mru->lists + grp); | ||
364 | |||
365 | /* | ||
366 | * We use GFP_KERNEL radix tree preload and do inserts under a | ||
367 | * spinlock so GFP_ATOMIC is appropriate for the radix tree itself. | ||
368 | */ | ||
369 | INIT_RADIX_TREE(&mru->store, GFP_ATOMIC); | ||
370 | INIT_LIST_HEAD(&mru->reap_list); | ||
371 | spinlock_init(&mru->lock, "xfs_mru_cache"); | ||
372 | INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap); | ||
373 | |||
374 | mru->grp_time = grp_time; | ||
375 | mru->free_func = free_func; | ||
376 | |||
377 | /* start up the reaper event */ | ||
378 | mru->next_reap = 0; | ||
379 | mru->reap_all = 0; | ||
380 | queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time); | ||
381 | |||
382 | *mrup = mru; | ||
383 | |||
384 | exit: | ||
385 | if (err && mru && mru->lists) | ||
386 | kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists)); | ||
387 | if (err && mru) | ||
388 | kmem_free(mru, sizeof(*mru)); | ||
389 | |||
390 | return err; | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | * Call xfs_mru_cache_flush() to flush out all cached entries, calling their | ||
395 | * free functions as they're deleted. When this function returns, the caller is | ||
396 | * guaranteed that all the free functions for all the elements have finished | ||
397 | * executing. | ||
398 | * | ||
399 | * While we are flushing, we stop the periodic reaper event from triggering. | ||
400 | * Normally, we want to restart this periodic event, but if we are shutting | ||
401 | * down the cache we do not want it restarted. Hence the restart parameter | ||
402 | * where 0 = do not restart reaper and 1 = restart reaper. | ||
403 | */ | ||
404 | void | ||
405 | xfs_mru_cache_flush( | ||
406 | xfs_mru_cache_t *mru, | ||
407 | int restart) | ||
408 | { | ||
409 | if (!mru || !mru->lists) | ||
410 | return; | ||
411 | |||
412 | cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); | ||
413 | |||
414 | mutex_spinlock(&mru->lock); | ||
415 | mru->reap_all = 1; | ||
416 | mutex_spinunlock(&mru->lock, 0); | ||
417 | |||
418 | queue_work(xfs_mru_reap_wq, &mru->work.work); | ||
419 | flush_workqueue(xfs_mru_reap_wq); | ||
420 | |||
421 | mutex_spinlock(&mru->lock); | ||
422 | WARN_ON_ONCE(mru->reap_all != 0); | ||
423 | mru->reap_all = 0; | ||
424 | if (restart) | ||
425 | queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time); | ||
426 | mutex_spinunlock(&mru->lock, 0); | ||
427 | } | ||
428 | |||
429 | void | ||
430 | xfs_mru_cache_destroy( | ||
431 | xfs_mru_cache_t *mru) | ||
432 | { | ||
433 | if (!mru || !mru->lists) | ||
434 | return; | ||
435 | |||
436 | /* we don't want the reaper to restart here */ | ||
437 | xfs_mru_cache_flush(mru, 0); | ||
438 | |||
439 | kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists)); | ||
440 | kmem_free(mru, sizeof(*mru)); | ||
441 | } | ||
442 | |||
443 | /* | ||
444 | * To insert an element, call xfs_mru_cache_insert() with the data store, the | ||
445 | * element's key and the client data pointer. This function returns 0 on | ||
446 | * success, EINVAL for an uninitialised cache, or ENOMEM on allocation failure. | ||
447 | */ | ||
448 | int | ||
449 | xfs_mru_cache_insert( | ||
450 | xfs_mru_cache_t *mru, | ||
451 | unsigned long key, | ||
452 | void *value) | ||
453 | { | ||
454 | xfs_mru_cache_elem_t *elem; | ||
455 | |||
456 | ASSERT(mru && mru->lists); | ||
457 | if (!mru || !mru->lists) | ||
458 | return EINVAL; | ||
459 | |||
460 | elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP); | ||
461 | if (!elem) | ||
462 | return ENOMEM; | ||
463 | |||
464 | if (radix_tree_preload(GFP_KERNEL)) { | ||
465 | kmem_zone_free(xfs_mru_elem_zone, elem); | ||
466 | return ENOMEM; | ||
467 | } | ||
468 | |||
469 | INIT_LIST_HEAD(&elem->list_node); | ||
470 | elem->key = key; | ||
471 | elem->value = value; | ||
472 | |||
473 | mutex_spinlock(&mru->lock); | ||
474 | |||
475 | radix_tree_insert(&mru->store, key, elem); | ||
476 | radix_tree_preload_end(); | ||
477 | _xfs_mru_cache_list_insert(mru, elem); | ||
478 | |||
479 | mutex_spinunlock(&mru->lock, 0); | ||
480 | |||
481 | return 0; | ||
482 | } | ||
483 | |||
484 | /* | ||
485 | * To remove an element without calling the free function, call | ||
486 | * xfs_mru_cache_remove() with the data store and the element's key. On success | ||
487 | * the client data pointer for the removed element is returned, otherwise this | ||
488 | * function will return a NULL pointer. | ||
489 | */ | ||
490 | void * | ||
491 | xfs_mru_cache_remove( | ||
492 | xfs_mru_cache_t *mru, | ||
493 | unsigned long key) | ||
494 | { | ||
495 | xfs_mru_cache_elem_t *elem; | ||
496 | void *value = NULL; | ||
497 | |||
498 | ASSERT(mru && mru->lists); | ||
499 | if (!mru || !mru->lists) | ||
500 | return NULL; | ||
501 | |||
502 | mutex_spinlock(&mru->lock); | ||
503 | elem = radix_tree_delete(&mru->store, key); | ||
504 | if (elem) { | ||
505 | value = elem->value; | ||
506 | list_del(&elem->list_node); | ||
507 | } | ||
508 | |||
509 | mutex_spinunlock(&mru->lock, 0); | ||
510 | |||
511 | if (elem) | ||
512 | kmem_zone_free(xfs_mru_elem_zone, elem); | ||
513 | |||
514 | return value; | ||
515 | } | ||
516 | |||
517 | /* | ||
518 | * To remove an element and call the free function, call xfs_mru_cache_delete() | ||
519 | * with the data store and the element's key. | ||
520 | */ | ||
521 | void | ||
522 | xfs_mru_cache_delete( | ||
523 | xfs_mru_cache_t *mru, | ||
524 | unsigned long key) | ||
525 | { | ||
526 | void *value = xfs_mru_cache_remove(mru, key); | ||
527 | |||
528 | if (value) | ||
529 | mru->free_func(key, value); | ||
530 | } | ||
531 | |||
532 | /* | ||
533 | * To look up an element using its key, call xfs_mru_cache_lookup() with the | ||
534 | * data store and the element's key. If found, the element will be moved to the | ||
535 | * head of the MRU list to indicate that it's been touched. | ||
536 | * | ||
537 | * The internal data structures are protected by a spinlock that is STILL HELD | ||
538 | * when this function returns. Call xfs_mru_cache_done() to release it. Note | ||
539 | * that it is not safe to call any function that might sleep in the interim. | ||
540 | * | ||
541 | * The implementation could have used reference counting to avoid this | ||
542 | * restriction, but since most clients simply want to get, set or test a member | ||
543 | * of the returned data structure, the extra per-element memory isn't warranted. | ||
544 | * | ||
545 | * If the element isn't found, this function returns NULL and the spinlock is | ||
546 | * released. xfs_mru_cache_done() should NOT be called when this occurs. | ||
547 | */ | ||
548 | void * | ||
549 | xfs_mru_cache_lookup( | ||
550 | xfs_mru_cache_t *mru, | ||
551 | unsigned long key) | ||
552 | { | ||
553 | xfs_mru_cache_elem_t *elem; | ||
554 | |||
555 | ASSERT(mru && mru->lists); | ||
556 | if (!mru || !mru->lists) | ||
557 | return NULL; | ||
558 | |||
559 | mutex_spinlock(&mru->lock); | ||
560 | elem = radix_tree_lookup(&mru->store, key); | ||
561 | if (elem) { | ||
562 | list_del(&elem->list_node); | ||
563 | _xfs_mru_cache_list_insert(mru, elem); | ||
564 | } | ||
565 | else | ||
566 | mutex_spinunlock(&mru->lock, 0); | ||
567 | |||
568 | return elem ? elem->value : NULL; | ||
569 | } | ||
570 | |||
571 | /* | ||
572 | * To look up an element using its key, but leave its location in the internal | ||
573 | * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this | ||
574 | * function returns NULL. | ||
575 | * | ||
576 | * See the comments above the declaration of the xfs_mru_cache_lookup() function | ||
577 | * for important locking information pertaining to this call. | ||
578 | */ | ||
579 | void * | ||
580 | xfs_mru_cache_peek( | ||
581 | xfs_mru_cache_t *mru, | ||
582 | unsigned long key) | ||
583 | { | ||
584 | xfs_mru_cache_elem_t *elem; | ||
585 | |||
586 | ASSERT(mru && mru->lists); | ||
587 | if (!mru || !mru->lists) | ||
588 | return NULL; | ||
589 | |||
590 | mutex_spinlock(&mru->lock); | ||
591 | elem = radix_tree_lookup(&mru->store, key); | ||
592 | if (!elem) | ||
593 | mutex_spinunlock(&mru->lock, 0); | ||
594 | |||
595 | return elem ? elem->value : NULL; | ||
596 | } | ||
597 | |||
598 | /* | ||
599 | * To release the internal data structure spinlock after having performed an | ||
600 | * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() | ||
601 | * with the data store pointer. | ||
602 | */ | ||
603 | void | ||
604 | xfs_mru_cache_done( | ||
605 | xfs_mru_cache_t *mru) | ||
606 | { | ||
607 | mutex_spinunlock(&mru->lock, 0); | ||
608 | } | ||
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h new file mode 100644 index 000000000000..624fd10ee8e5 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.h | |||
@@ -0,0 +1,57 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006-2007 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #ifndef __XFS_MRU_CACHE_H__ | ||
19 | #define __XFS_MRU_CACHE_H__ | ||
20 | |||
21 | |||
22 | /* Function pointer type for callback to free a client's data pointer. */ | ||
23 | typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*); | ||
24 | |||
25 | typedef struct xfs_mru_cache | ||
26 | { | ||
27 | struct radix_tree_root store; /* Core storage data structure. */ | ||
28 | struct list_head *lists; /* Array of lists, one per grp. */ | ||
29 | struct list_head reap_list; /* Elements overdue for reaping. */ | ||
30 | spinlock_t lock; /* Lock to protect this struct. */ | ||
31 | unsigned int grp_count; /* Number of discrete groups. */ | ||
32 | unsigned int grp_time; /* Time period spanned by grps. */ | ||
33 | unsigned int lru_grp; /* Group containing time zero. */ | ||
34 | unsigned long time_zero; /* Time first element was added. */ | ||
35 | unsigned long next_reap; /* Time that the reaper should | ||
36 | next do something. */ | ||
37 | unsigned int reap_all; /* if set, reap all lists */ | ||
38 | xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ | ||
39 | struct delayed_work work; /* Workqueue data for reaping. */ | ||
40 | } xfs_mru_cache_t; | ||
41 | |||
42 | int xfs_mru_cache_init(void); | ||
43 | void xfs_mru_cache_uninit(void); | ||
44 | int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, | ||
45 | unsigned int grp_count, | ||
46 | xfs_mru_cache_free_func_t free_func); | ||
47 | void xfs_mru_cache_flush(xfs_mru_cache_t *mru, int restart); | ||
48 | void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); | ||
49 | int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, | ||
50 | void *value); | ||
51 | void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); | ||
52 | void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); | ||
53 | void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); | ||
54 | void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key); | ||
55 | void xfs_mru_cache_done(struct xfs_mru_cache *mru); | ||
56 | |||
57 | #endif /* __XFS_MRU_CACHE_H__ */ | ||
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index c343fde10ef9..11f5ea29a038 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c | |||
@@ -51,6 +51,8 @@ | |||
51 | #include "xfs_acl.h" | 51 | #include "xfs_acl.h" |
52 | #include "xfs_attr.h" | 52 | #include "xfs_attr.h" |
53 | #include "xfs_clnt.h" | 53 | #include "xfs_clnt.h" |
54 | #include "xfs_mru_cache.h" | ||
55 | #include "xfs_filestream.h" | ||
54 | #include "xfs_fsops.h" | 56 | #include "xfs_fsops.h" |
55 | 57 | ||
56 | STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); | 58 | STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); |
@@ -81,6 +83,8 @@ xfs_init(void) | |||
81 | xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); | 83 | xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); |
82 | xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); | 84 | xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); |
83 | xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); | 85 | xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); |
86 | xfs_mru_cache_init(); | ||
87 | xfs_filestream_init(); | ||
84 | 88 | ||
85 | /* | 89 | /* |
86 | * The size of the zone allocated buf log item is the maximum | 90 | * The size of the zone allocated buf log item is the maximum |
@@ -164,6 +168,8 @@ xfs_cleanup(void) | |||
164 | xfs_cleanup_procfs(); | 168 | xfs_cleanup_procfs(); |
165 | xfs_sysctl_unregister(); | 169 | xfs_sysctl_unregister(); |
166 | xfs_refcache_destroy(); | 170 | xfs_refcache_destroy(); |
171 | xfs_filestream_uninit(); | ||
172 | xfs_mru_cache_uninit(); | ||
167 | xfs_acl_zone_destroy(xfs_acl_zone); | 173 | xfs_acl_zone_destroy(xfs_acl_zone); |
168 | 174 | ||
169 | #ifdef XFS_DIR2_TRACE | 175 | #ifdef XFS_DIR2_TRACE |
@@ -320,6 +326,9 @@ xfs_start_flags( | |||
320 | else | 326 | else |
321 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | 327 | mp->m_flags &= ~XFS_MOUNT_BARRIER; |
322 | 328 | ||
329 | if (ap->flags2 & XFSMNT2_FILESTREAMS) | ||
330 | mp->m_flags |= XFS_MOUNT_FILESTREAMS; | ||
331 | |||
323 | return 0; | 332 | return 0; |
324 | } | 333 | } |
325 | 334 | ||
@@ -518,6 +527,9 @@ xfs_mount( | |||
518 | if (mp->m_flags & XFS_MOUNT_BARRIER) | 527 | if (mp->m_flags & XFS_MOUNT_BARRIER) |
519 | xfs_mountfs_check_barriers(mp); | 528 | xfs_mountfs_check_barriers(mp); |
520 | 529 | ||
530 | if ((error = xfs_filestream_mount(mp))) | ||
531 | goto error2; | ||
532 | |||
521 | error = XFS_IOINIT(vfsp, args, flags); | 533 | error = XFS_IOINIT(vfsp, args, flags); |
522 | if (error) | 534 | if (error) |
523 | goto error2; | 535 | goto error2; |
@@ -575,6 +587,13 @@ xfs_unmount( | |||
575 | */ | 587 | */ |
576 | xfs_refcache_purge_mp(mp); | 588 | xfs_refcache_purge_mp(mp); |
577 | 589 | ||
590 | /* | ||
591 | * Blow away any referenced inode in the filestreams cache. | ||
592 | * This can and will cause log traffic as inodes go inactive | ||
593 | * here. | ||
594 | */ | ||
595 | xfs_filestream_unmount(mp); | ||
596 | |||
578 | XFS_bflush(mp->m_ddev_targp); | 597 | XFS_bflush(mp->m_ddev_targp); |
579 | error = xfs_unmount_flush(mp, 0); | 598 | error = xfs_unmount_flush(mp, 0); |
580 | if (error) | 599 | if (error) |
@@ -694,6 +713,7 @@ xfs_mntupdate( | |||
694 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | 713 | mp->m_flags &= ~XFS_MOUNT_BARRIER; |
695 | } | 714 | } |
696 | } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ | 715 | } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ |
716 | xfs_filestream_flush(mp); | ||
697 | bhv_vfs_sync(vfsp, SYNC_DATA_QUIESCE, NULL); | 717 | bhv_vfs_sync(vfsp, SYNC_DATA_QUIESCE, NULL); |
698 | xfs_attr_quiesce(mp); | 718 | xfs_attr_quiesce(mp); |
699 | vfsp->vfs_flag |= VFS_RDONLY; | 719 | vfsp->vfs_flag |= VFS_RDONLY; |
@@ -909,6 +929,9 @@ xfs_sync( | |||
909 | { | 929 | { |
910 | xfs_mount_t *mp = XFS_BHVTOM(bdp); | 930 | xfs_mount_t *mp = XFS_BHVTOM(bdp); |
911 | 931 | ||
932 | if (flags & SYNC_IOWAIT) | ||
933 | xfs_filestream_flush(mp); | ||
934 | |||
912 | return xfs_syncsub(mp, flags, NULL); | 935 | return xfs_syncsub(mp, flags, NULL); |
913 | } | 936 | } |
914 | 937 | ||
@@ -1659,6 +1682,7 @@ xfs_vget( | |||
1659 | * in stat(). */ | 1682 | * in stat(). */ |
1660 | #define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ | 1683 | #define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ |
1661 | #define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ | 1684 | #define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ |
1685 | #define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ | ||
1662 | 1686 | ||
1663 | STATIC unsigned long | 1687 | STATIC unsigned long |
1664 | suffix_strtoul(char *s, char **endp, unsigned int base) | 1688 | suffix_strtoul(char *s, char **endp, unsigned int base) |
@@ -1845,6 +1869,8 @@ xfs_parseargs( | |||
1845 | args->flags |= XFSMNT_ATTR2; | 1869 | args->flags |= XFSMNT_ATTR2; |
1846 | } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { | 1870 | } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { |
1847 | args->flags &= ~XFSMNT_ATTR2; | 1871 | args->flags &= ~XFSMNT_ATTR2; |
1872 | } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { | ||
1873 | args->flags2 |= XFSMNT2_FILESTREAMS; | ||
1848 | } else if (!strcmp(this_char, "osyncisdsync")) { | 1874 | } else if (!strcmp(this_char, "osyncisdsync")) { |
1849 | /* no-op, this is now the default */ | 1875 | /* no-op, this is now the default */ |
1850 | cmn_err(CE_WARN, | 1876 | cmn_err(CE_WARN, |
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2067d0b0a10e..60fd0be90a16 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include "xfs_refcache.h" | 51 | #include "xfs_refcache.h" |
52 | #include "xfs_trans_space.h" | 52 | #include "xfs_trans_space.h" |
53 | #include "xfs_log_priv.h" | 53 | #include "xfs_log_priv.h" |
54 | #include "xfs_filestream.h" | ||
54 | 55 | ||
55 | STATIC int | 56 | STATIC int |
56 | xfs_open( | 57 | xfs_open( |
@@ -783,6 +784,8 @@ xfs_setattr( | |||
783 | di_flags |= XFS_DIFLAG_PROJINHERIT; | 784 | di_flags |= XFS_DIFLAG_PROJINHERIT; |
784 | if (vap->va_xflags & XFS_XFLAG_NODEFRAG) | 785 | if (vap->va_xflags & XFS_XFLAG_NODEFRAG) |
785 | di_flags |= XFS_DIFLAG_NODEFRAG; | 786 | di_flags |= XFS_DIFLAG_NODEFRAG; |
787 | if (vap->va_xflags & XFS_XFLAG_FILESTREAM) | ||
788 | di_flags |= XFS_DIFLAG_FILESTREAM; | ||
786 | if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { | 789 | if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { |
787 | if (vap->va_xflags & XFS_XFLAG_RTINHERIT) | 790 | if (vap->va_xflags & XFS_XFLAG_RTINHERIT) |
788 | di_flags |= XFS_DIFLAG_RTINHERIT; | 791 | di_flags |= XFS_DIFLAG_RTINHERIT; |
@@ -1536,7 +1539,17 @@ xfs_release( | |||
1536 | if (vp->v_vfsp->vfs_flag & VFS_RDONLY) | 1539 | if (vp->v_vfsp->vfs_flag & VFS_RDONLY) |
1537 | return 0; | 1540 | return 0; |
1538 | 1541 | ||
1539 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 1542 | if (!XFS_FORCED_SHUTDOWN(mp)) { |
1543 | /* | ||
1544 | * If we are using filestreams, and we have an unlinked | ||
1545 | * file that we are processing the last close on, then nothing | ||
1546 | * will be able to reopen and write to this file. Purge this | ||
1547 | * inode from the filestreams cache so that it doesn't delay | ||
1548 | * teardown of the inode. | ||
1549 | */ | ||
1550 | if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) | ||
1551 | xfs_filestream_deassociate(ip); | ||
1552 | |||
1540 | /* | 1553 | /* |
1541 | * If we previously truncated this file and removed old data | 1554 | * If we previously truncated this file and removed old data |
1542 | * in the process, we want to initiate "early" writeout on | 1555 | * in the process, we want to initiate "early" writeout on |
@@ -1551,7 +1564,6 @@ xfs_release( | |||
1551 | bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE); | 1564 | bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE); |
1552 | } | 1565 | } |
1553 | 1566 | ||
1554 | |||
1555 | #ifdef HAVE_REFCACHE | 1567 | #ifdef HAVE_REFCACHE |
1556 | /* If we are in the NFS reference cache then don't do this now */ | 1568 | /* If we are in the NFS reference cache then don't do this now */ |
1557 | if (ip->i_refcache) | 1569 | if (ip->i_refcache) |
@@ -2541,6 +2553,15 @@ xfs_remove( | |||
2541 | */ | 2553 | */ |
2542 | xfs_refcache_purge_ip(ip); | 2554 | xfs_refcache_purge_ip(ip); |
2543 | 2555 | ||
2556 | /* | ||
2557 | * If we are using filestreams, kill the stream association. | ||
2558 | * If the file is still open it may get a new one but that | ||
2559 | * will get killed on last close in xfs_release() so we don't | ||
2560 | * have to worry about that. | ||
2561 | */ | ||
2562 | if (link_zero && xfs_inode_is_filestream(ip)) | ||
2563 | xfs_filestream_deassociate(ip); | ||
2564 | |||
2544 | vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); | 2565 | vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); |
2545 | 2566 | ||
2546 | /* | 2567 | /* |