aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/xfs/Makefile-linux-2.62
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.h2
-rw-r--r--fs/xfs/xfs.h1
-rw-r--r--fs/xfs/xfs_ag.h1
-rw-r--r--fs/xfs/xfs_bmap.c69
-rw-r--r--fs/xfs/xfs_clnt.h2
-rw-r--r--fs/xfs/xfs_dinode.h4
-rw-r--r--fs/xfs/xfs_filestream.c771
-rw-r--r--fs/xfs/xfs_filestream.h136
-rw-r--r--fs/xfs/xfs_fs.h1
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_inode.c17
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_mount.h4
-rw-r--r--fs/xfs/xfs_mru_cache.c608
-rw-r--r--fs/xfs/xfs_mru_cache.h57
-rw-r--r--fs/xfs/xfs_vfsops.c26
-rw-r--r--fs/xfs/xfs_vnodeops.c25
21 files changed, 1730 insertions, 12 deletions
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index b49989bb89ad..e7a9a83f0087 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -64,6 +64,7 @@ xfs-y += xfs_alloc.o \
64 xfs_dir2_sf.o \ 64 xfs_dir2_sf.o \
65 xfs_error.o \ 65 xfs_error.o \
66 xfs_extfree_item.o \ 66 xfs_extfree_item.o \
67 xfs_filestream.o \
67 xfs_fsops.o \ 68 xfs_fsops.o \
68 xfs_ialloc.o \ 69 xfs_ialloc.o \
69 xfs_ialloc_btree.o \ 70 xfs_ialloc_btree.o \
@@ -77,6 +78,7 @@ xfs-y += xfs_alloc.o \
77 xfs_log.o \ 78 xfs_log.o \
78 xfs_log_recover.o \ 79 xfs_log_recover.o \
79 xfs_mount.o \ 80 xfs_mount.o \
81 xfs_mru_cache.o \
80 xfs_rename.o \ 82 xfs_rename.o \
81 xfs_trans.o \ 83 xfs_trans.o \
82 xfs_trans_ail.o \ 84 xfs_trans_ail.o \
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index ed3a5e1b4b67..bb72c3d4141f 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -46,6 +46,7 @@ xfs_param_t xfs_params = {
46 .inherit_nosym = { 0, 0, 1 }, 46 .inherit_nosym = { 0, 0, 1 },
47 .rotorstep = { 1, 1, 255 }, 47 .rotorstep = { 1, 1, 255 },
48 .inherit_nodfrg = { 0, 1, 1 }, 48 .inherit_nodfrg = { 0, 1, 1 },
49 .fstrm_timer = { 1, 50, 3600*100},
49}; 50};
50 51
51/* 52/*
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index af24a457d3a3..330c4ba9d404 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -123,6 +123,7 @@
123#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val 123#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
124#define xfs_rotorstep xfs_params.rotorstep.val 124#define xfs_rotorstep xfs_params.rotorstep.val
125#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val 125#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
126#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
126 127
127#define current_cpu() (raw_smp_processor_id()) 128#define current_cpu() (raw_smp_processor_id())
128#define current_pid() (current->pid) 129#define current_pid() (current->pid)
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index cd6eaa44aa2b..bb997d75c05c 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -210,6 +210,17 @@ static ctl_table xfs_table[] = {
210 .extra1 = &xfs_params.inherit_nodfrg.min, 210 .extra1 = &xfs_params.inherit_nodfrg.min,
211 .extra2 = &xfs_params.inherit_nodfrg.max 211 .extra2 = &xfs_params.inherit_nodfrg.max
212 }, 212 },
213 {
214 .ctl_name = XFS_FILESTREAM_TIMER,
215 .procname = "filestream_centisecs",
216 .data = &xfs_params.fstrm_timer.val,
217 .maxlen = sizeof(int),
218 .mode = 0644,
219 .proc_handler = &proc_dointvec_minmax,
220 .strategy = &sysctl_intvec,
221 .extra1 = &xfs_params.fstrm_timer.min,
222 .extra2 = &xfs_params.fstrm_timer.max,
223 },
213 /* please keep this the last entry */ 224 /* please keep this the last entry */
214#ifdef CONFIG_PROC_FS 225#ifdef CONFIG_PROC_FS
215 { 226 {
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index a631fb8cc5ac..98b97e399d6f 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -47,6 +47,7 @@ typedef struct xfs_param {
47 xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ 47 xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
48 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ 48 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
49 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ 49 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
50 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
50} xfs_param_t; 51} xfs_param_t;
51 52
52/* 53/*
@@ -86,6 +87,7 @@ enum {
86 XFS_INHERIT_NOSYM = 19, 87 XFS_INHERIT_NOSYM = 19,
87 XFS_ROTORSTEP = 20, 88 XFS_ROTORSTEP = 20,
88 XFS_INHERIT_NODFRG = 21, 89 XFS_INHERIT_NODFRG = 21,
90 XFS_FILESTREAM_TIMER = 22,
89}; 91};
90 92
91extern xfs_param_t xfs_params; 93extern xfs_param_t xfs_params;
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index bf0a12040b13..b5a7d92c6843 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -38,6 +38,7 @@
38#define XFS_RW_TRACE 1 38#define XFS_RW_TRACE 1
39#define XFS_BUF_TRACE 1 39#define XFS_BUF_TRACE 1
40#define XFS_VNODE_TRACE 1 40#define XFS_VNODE_TRACE 1
41#define XFS_FILESTREAMS_TRACE 1
41#endif 42#endif
42 43
43#include <linux-2.6/xfs_linux.h> 44#include <linux-2.6/xfs_linux.h>
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index b1dd0029c60e..51c09c114a20 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -196,6 +196,7 @@ typedef struct xfs_perag
196 lock_t pagb_lock; /* lock for pagb_list */ 196 lock_t pagb_lock; /* lock for pagb_list */
197#endif 197#endif
198 xfs_perag_busy_t *pagb_list; /* unstable blocks */ 198 xfs_perag_busy_t *pagb_list; /* unstable blocks */
199 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
199} xfs_perag_t; 200} xfs_perag_t;
200 201
201#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 202#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 09d86388bb71..51ba689a4552 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -52,6 +52,7 @@
52#include "xfs_quota.h" 52#include "xfs_quota.h"
53#include "xfs_trans_space.h" 53#include "xfs_trans_space.h"
54#include "xfs_buf_item.h" 54#include "xfs_buf_item.h"
55#include "xfs_filestream.h"
55 56
56 57
57#ifdef DEBUG 58#ifdef DEBUG
@@ -2725,9 +2726,15 @@ xfs_bmap_btalloc(
2725 } 2726 }
2726 nullfb = ap->firstblock == NULLFSBLOCK; 2727 nullfb = ap->firstblock == NULLFSBLOCK;
2727 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); 2728 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
2728 if (nullfb) 2729 if (nullfb) {
2729 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); 2730 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
2730 else 2731 ag = xfs_filestream_lookup_ag(ap->ip);
2732 ag = (ag != NULLAGNUMBER) ? ag : 0;
2733 ap->rval = XFS_AGB_TO_FSB(mp, ag, 0);
2734 } else {
2735 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
2736 }
2737 } else
2731 ap->rval = ap->firstblock; 2738 ap->rval = ap->firstblock;
2732 2739
2733 xfs_bmap_adjacent(ap); 2740 xfs_bmap_adjacent(ap);
@@ -2751,13 +2758,22 @@ xfs_bmap_btalloc(
2751 args.firstblock = ap->firstblock; 2758 args.firstblock = ap->firstblock;
2752 blen = 0; 2759 blen = 0;
2753 if (nullfb) { 2760 if (nullfb) {
2754 args.type = XFS_ALLOCTYPE_START_BNO; 2761 if (ap->userdata && xfs_inode_is_filestream(ap->ip))
2762 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2763 else
2764 args.type = XFS_ALLOCTYPE_START_BNO;
2755 args.total = ap->total; 2765 args.total = ap->total;
2766
2756 /* 2767 /*
2757 * Find the longest available space. 2768 * Search for an allocation group with a single extent
2758 * We're going to try for the whole allocation at once. 2769 * large enough for the request.
2770 *
2771 * If one isn't found, then adjust the minimum allocation
2772 * size to the largest space found.
2759 */ 2773 */
2760 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); 2774 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
2775 if (startag == NULLAGNUMBER)
2776 startag = ag = 0;
2761 notinit = 0; 2777 notinit = 0;
2762 down_read(&mp->m_peraglock); 2778 down_read(&mp->m_peraglock);
2763 while (blen < ap->alen) { 2779 while (blen < ap->alen) {
@@ -2783,6 +2799,35 @@ xfs_bmap_btalloc(
2783 blen = longest; 2799 blen = longest;
2784 } else 2800 } else
2785 notinit = 1; 2801 notinit = 1;
2802
2803 if (xfs_inode_is_filestream(ap->ip)) {
2804 if (blen >= ap->alen)
2805 break;
2806
2807 if (ap->userdata) {
2808 /*
2809 * If startag is an invalid AG, we've
2810 * come here once before and
2811 * xfs_filestream_new_ag picked the
2812 * best currently available.
2813 *
2814 * Don't continue looping, since we
2815 * could loop forever.
2816 */
2817 if (startag == NULLAGNUMBER)
2818 break;
2819
2820 error = xfs_filestream_new_ag(ap, &ag);
2821 if (error) {
2822 up_read(&mp->m_peraglock);
2823 return error;
2824 }
2825
2826 /* loop again to set 'blen'*/
2827 startag = NULLAGNUMBER;
2828 continue;
2829 }
2830 }
2786 if (++ag == mp->m_sb.sb_agcount) 2831 if (++ag == mp->m_sb.sb_agcount)
2787 ag = 0; 2832 ag = 0;
2788 if (ag == startag) 2833 if (ag == startag)
@@ -2807,8 +2852,18 @@ xfs_bmap_btalloc(
2807 */ 2852 */
2808 else 2853 else
2809 args.minlen = ap->alen; 2854 args.minlen = ap->alen;
2855
2856 /*
2857 * set the failure fallback case to look in the selected
2858 * AG as the stream may have moved.
2859 */
2860 if (xfs_inode_is_filestream(ap->ip))
2861 ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2810 } else if (ap->low) { 2862 } else if (ap->low) {
2811 args.type = XFS_ALLOCTYPE_START_BNO; 2863 if (xfs_inode_is_filestream(ap->ip))
2864 args.type = XFS_ALLOCTYPE_FIRST_AG;
2865 else
2866 args.type = XFS_ALLOCTYPE_START_BNO;
2812 args.total = args.minlen = ap->minlen; 2867 args.total = args.minlen = ap->minlen;
2813 } else { 2868 } else {
2814 args.type = XFS_ALLOCTYPE_NEAR_BNO; 2869 args.type = XFS_ALLOCTYPE_NEAR_BNO;
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index 5b7eb81453be..f89196cb08d2 100644
--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -99,5 +99,7 @@ struct xfs_mount_args {
99 */ 99 */
100#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred 100#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred
101 * I/O size in stat(2) */ 101 * I/O size in stat(2) */
102#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams
103 * allocator */
102 104
103#endif /* __XFS_CLNT_H__ */ 105#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index b33826961c45..fefd0116bac9 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -257,6 +257,7 @@ typedef enum xfs_dinode_fmt
257#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ 257#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
258#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ 258#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
259#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ 259#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
260#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
260#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) 261#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
261#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) 262#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
262#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) 263#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
@@ -271,12 +272,13 @@ typedef enum xfs_dinode_fmt
271#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) 272#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
272#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) 273#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
273#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) 274#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
275#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
274 276
275#define XFS_DIFLAG_ANY \ 277#define XFS_DIFLAG_ANY \
276 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ 278 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
277 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ 279 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
278 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ 280 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
279 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ 281 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
280 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG) 282 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
281 283
282#endif /* __XFS_DINODE_H__ */ 284#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
new file mode 100644
index 000000000000..ce2278611bb7
--- /dev/null
+++ b/fs/xfs/xfs_filestream.c
@@ -0,0 +1,771 @@
1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_bmap_btree.h"
20#include "xfs_inum.h"
21#include "xfs_dir2.h"
22#include "xfs_dir2_sf.h"
23#include "xfs_attr_sf.h"
24#include "xfs_dinode.h"
25#include "xfs_inode.h"
26#include "xfs_ag.h"
27#include "xfs_dmapi.h"
28#include "xfs_log.h"
29#include "xfs_trans.h"
30#include "xfs_sb.h"
31#include "xfs_mount.h"
32#include "xfs_bmap.h"
33#include "xfs_alloc.h"
34#include "xfs_utils.h"
35#include "xfs_mru_cache.h"
36#include "xfs_filestream.h"
37
38#ifdef XFS_FILESTREAMS_TRACE
39
40ktrace_t *xfs_filestreams_trace_buf;
41
42STATIC void
43xfs_filestreams_trace(
44 xfs_mount_t *mp, /* mount point */
45 int type, /* type of trace */
46 const char *func, /* source function */
47 int line, /* source line number */
48 __psunsigned_t arg0,
49 __psunsigned_t arg1,
50 __psunsigned_t arg2,
51 __psunsigned_t arg3,
52 __psunsigned_t arg4,
53 __psunsigned_t arg5)
54{
55 ktrace_enter(xfs_filestreams_trace_buf,
56 (void *)(__psint_t)(type | (line << 16)),
57 (void *)func,
58 (void *)(__psunsigned_t)current_pid(),
59 (void *)mp,
60 (void *)(__psunsigned_t)arg0,
61 (void *)(__psunsigned_t)arg1,
62 (void *)(__psunsigned_t)arg2,
63 (void *)(__psunsigned_t)arg3,
64 (void *)(__psunsigned_t)arg4,
65 (void *)(__psunsigned_t)arg5,
66 NULL, NULL, NULL, NULL, NULL, NULL);
67}
68
69#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0)
70#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0)
71#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0)
72#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0)
73#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
74#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
75#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
76 xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \
77 (__psunsigned_t)a0, (__psunsigned_t)a1, \
78 (__psunsigned_t)a2, (__psunsigned_t)a3, \
79 (__psunsigned_t)a4, (__psunsigned_t)a5)
80
81#define TRACE_AG_SCAN(mp, ag, ag2) \
82 TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
83#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
84 TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
85#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
86 TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
87 cnt, free, scan, flag)
88#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
89 TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
90#define TRACE_FREE(mp, ip, pip, ag, cnt) \
91 TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
92#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
93 TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
94#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
95 TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
96#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
97 TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
98#define TRACE_ORPHAN(mp, ip, ag) \
99 TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
100
101
102#else
103#define TRACE_AG_SCAN(mp, ag, ag2)
104#define TRACE_AG_PICK1(mp, max_ag, maxfree)
105#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
106#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
107#define TRACE_FREE(mp, ip, pip, ag, cnt)
108#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
109#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
110#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
111#define TRACE_ORPHAN(mp, ip, ag)
112#endif
113
114static kmem_zone_t *item_zone;
115
116/*
117 * Structure for associating a file or a directory with an allocation group.
118 * The parent directory pointer is only needed for files, but since there will
119 * generally be vastly more files than directories in the cache, using the same
120 * data structure simplifies the code with very little memory overhead.
121 */
122typedef struct fstrm_item
123{
124 xfs_agnumber_t ag; /* AG currently in use for the file/directory. */
125 xfs_inode_t *ip; /* inode self-pointer. */
126 xfs_inode_t *pip; /* Parent directory inode pointer. */
127} fstrm_item_t;
128
129
130/*
131 * Scan the AGs starting at startag looking for an AG that isn't in use and has
132 * at least minlen blocks free.
133 */
134static int
135_xfs_filestream_pick_ag(
136 xfs_mount_t *mp,
137 xfs_agnumber_t startag,
138 xfs_agnumber_t *agp,
139 int flags,
140 xfs_extlen_t minlen)
141{
142 int err, trylock, nscan;
143 xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0;
144 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
145 struct xfs_perag *pag;
146
147 /* 2% of an AG's blocks must be free for it to be chosen. */
148 minfree = mp->m_sb.sb_agblocks / 50;
149
150 ag = startag;
151 *agp = NULLAGNUMBER;
152
153 /* For the first pass, don't sleep trying to init the per-AG. */
154 trylock = XFS_ALLOC_FLAG_TRYLOCK;
155
156 for (nscan = 0; 1; nscan++) {
157
158 TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag));
159
160 pag = mp->m_perag + ag;
161
162 if (!pag->pagf_init) {
163 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
164 if (err && !trylock)
165 return err;
166 }
167
168 /* Might fail sometimes during the 1st pass with trylock set. */
169 if (!pag->pagf_init)
170 goto next_ag;
171
172 /* Keep track of the AG with the most free blocks. */
173 if (pag->pagf_freeblks > maxfree) {
174 maxfree = pag->pagf_freeblks;
175 max_ag = ag;
176 }
177
178 /*
179 * The AG reference count does two things: it enforces mutual
180 * exclusion when examining the suitability of an AG in this
181 * loop, and it guards against two filestreams being established
182 * in the same AG as each other.
183 */
184 if (xfs_filestream_get_ag(mp, ag) > 1) {
185 xfs_filestream_put_ag(mp, ag);
186 goto next_ag;
187 }
188
189 need = XFS_MIN_FREELIST_PAG(pag, mp);
190 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
191 longest = (pag->pagf_longest > delta) ?
192 (pag->pagf_longest - delta) :
193 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
194
195 if (((minlen && longest >= minlen) ||
196 (!minlen && pag->pagf_freeblks >= minfree)) &&
197 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
198 (flags & XFS_PICK_LOWSPACE))) {
199
200 /* Break out, retaining the reference on the AG. */
201 free = pag->pagf_freeblks;
202 *agp = ag;
203 break;
204 }
205
206 /* Drop the reference on this AG, it's not usable. */
207 xfs_filestream_put_ag(mp, ag);
208next_ag:
209 /* Move to the next AG, wrapping to AG 0 if necessary. */
210 if (++ag >= mp->m_sb.sb_agcount)
211 ag = 0;
212
213 /* If a full pass of the AGs hasn't been done yet, continue. */
214 if (ag != startag)
215 continue;
216
217 /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
218 if (trylock != 0) {
219 trylock = 0;
220 continue;
221 }
222
223 /* Finally, if lowspace wasn't set, set it for the 3rd pass. */
224 if (!(flags & XFS_PICK_LOWSPACE)) {
225 flags |= XFS_PICK_LOWSPACE;
226 continue;
227 }
228
229 /*
230 * Take the AG with the most free space, regardless of whether
231 * it's already in use by another filestream.
232 */
233 if (max_ag != NULLAGNUMBER) {
234 xfs_filestream_get_ag(mp, max_ag);
235 TRACE_AG_PICK1(mp, max_ag, maxfree);
236 free = maxfree;
237 *agp = max_ag;
238 break;
239 }
240
241 /* take AG 0 if none matched */
242 TRACE_AG_PICK1(mp, max_ag, maxfree);
243 *agp = 0;
244 return 0;
245 }
246
247 TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp),
248 free, nscan, flags);
249
250 return 0;
251}
252
253/*
254 * Set the allocation group number for a file or a directory, updating inode
255 * references and per-AG references as appropriate. Must be called with the
256 * m_peraglock held in read mode.
257 */
258static int
259_xfs_filestream_update_ag(
260 xfs_inode_t *ip,
261 xfs_inode_t *pip,
262 xfs_agnumber_t ag)
263{
264 int err = 0;
265 xfs_mount_t *mp;
266 xfs_mru_cache_t *cache;
267 fstrm_item_t *item;
268 xfs_agnumber_t old_ag;
269 xfs_inode_t *old_pip;
270
271 /*
272 * Either ip is a regular file and pip is a directory, or ip is a
273 * directory and pip is NULL.
274 */
275 ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip &&
276 (pip->i_d.di_mode & S_IFDIR)) ||
277 ((ip->i_d.di_mode & S_IFDIR) && !pip)));
278
279 mp = ip->i_mount;
280 cache = mp->m_filestream;
281
282 item = xfs_mru_cache_lookup(cache, ip->i_ino);
283 if (item) {
284 ASSERT(item->ip == ip);
285 old_ag = item->ag;
286 item->ag = ag;
287 old_pip = item->pip;
288 item->pip = pip;
289 xfs_mru_cache_done(cache);
290
291 /*
292 * If the AG has changed, drop the old ref and take a new one,
293 * effectively transferring the reference from old to new AG.
294 */
295 if (ag != old_ag) {
296 xfs_filestream_put_ag(mp, old_ag);
297 xfs_filestream_get_ag(mp, ag);
298 }
299
300 /*
301 * If ip is a file and its pip has changed, drop the old ref and
302 * take a new one.
303 */
304 if (pip && pip != old_pip) {
305 IRELE(old_pip);
306 IHOLD(pip);
307 }
308
309 TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
310 ag, xfs_filestream_peek_ag(mp, ag));
311 return 0;
312 }
313
314 item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
315 if (!item)
316 return ENOMEM;
317
318 item->ag = ag;
319 item->ip = ip;
320 item->pip = pip;
321
322 err = xfs_mru_cache_insert(cache, ip->i_ino, item);
323 if (err) {
324 kmem_zone_free(item_zone, item);
325 return err;
326 }
327
328 /* Take a reference on the AG. */
329 xfs_filestream_get_ag(mp, ag);
330
331 /*
332 * Take a reference on the inode itself regardless of whether it's a
333 * regular file or a directory.
334 */
335 IHOLD(ip);
336
337 /*
338 * In the case of a regular file, take a reference on the parent inode
339 * as well to ensure it remains in-core.
340 */
341 if (pip)
342 IHOLD(pip);
343
344 TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
345 ag, xfs_filestream_peek_ag(mp, ag));
346
347 return 0;
348}
349
350/* xfs_fstrm_free_func(): callback for freeing cached stream items. */
351void
352xfs_fstrm_free_func(
353 xfs_ino_t ino,
354 fstrm_item_t *item)
355{
356 xfs_inode_t *ip = item->ip;
357 int ref;
358
359 ASSERT(ip->i_ino == ino);
360
361 xfs_iflags_clear(ip, XFS_IFILESTREAM);
362
363 /* Drop the reference taken on the AG when the item was added. */
364 ref = xfs_filestream_put_ag(ip->i_mount, item->ag);
365
366 ASSERT(ref >= 0);
367 TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
368 xfs_filestream_peek_ag(ip->i_mount, item->ag));
369
370 /*
371 * _xfs_filestream_update_ag() always takes a reference on the inode
372 * itself, whether it's a file or a directory. Release it here.
373 * This can result in the inode being freed and so we must
374 * not hold any inode locks when freeing filesstreams objects
375 * otherwise we can deadlock here.
376 */
377 IRELE(ip);
378
379 /*
380 * In the case of a regular file, _xfs_filestream_update_ag() also
381 * takes a ref on the parent inode to keep it in-core. Release that
382 * too.
383 */
384 if (item->pip)
385 IRELE(item->pip);
386
387 /* Finally, free the memory allocated for the item. */
388 kmem_zone_free(item_zone, item);
389}
390
391/*
392 * xfs_filestream_init() is called at xfs initialisation time to set up the
393 * memory zone that will be used for filestream data structure allocation.
394 */
395int
396xfs_filestream_init(void)
397{
398 item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
399#ifdef XFS_FILESTREAMS_TRACE
400 xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP);
401#endif
402 return item_zone ? 0 : -ENOMEM;
403}
404
405/*
406 * xfs_filestream_uninit() is called at xfs termination time to destroy the
407 * memory zone that was used for filestream data structure allocation.
408 */
409void
410xfs_filestream_uninit(void)
411{
412#ifdef XFS_FILESTREAMS_TRACE
413 ktrace_free(xfs_filestreams_trace_buf);
414#endif
415 kmem_zone_destroy(item_zone);
416}
417
418/*
419 * xfs_filestream_mount() is called when a file system is mounted with the
420 * filestream option. It is responsible for allocating the data structures
421 * needed to track the new file system's file streams.
422 */
423int
424xfs_filestream_mount(
425 xfs_mount_t *mp)
426{
427 int err;
428 unsigned int lifetime, grp_count;
429
430 /*
431 * The filestream timer tunable is currently fixed within the range of
432 * one second to four minutes, with five seconds being the default. The
433 * group count is somewhat arbitrary, but it'd be nice to adhere to the
434 * timer tunable to within about 10 percent. This requires at least 10
435 * groups.
436 */
437 lifetime = xfs_fstrm_centisecs * 10;
438 grp_count = 10;
439
440 err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
441 (xfs_mru_cache_free_func_t)xfs_fstrm_free_func);
442
443 return err;
444}
445
446/*
447 * xfs_filestream_unmount() is called when a file system that was mounted with
448 * the filestream option is unmounted. It drains the data structures created
449 * to track the file system's file streams and frees all the memory that was
450 * allocated.
451 */
452void
453xfs_filestream_unmount(
454 xfs_mount_t *mp)
455{
456 xfs_mru_cache_destroy(mp->m_filestream);
457}
458
459/*
460 * If the mount point's m_perag array is going to be reallocated, all
461 * outstanding cache entries must be flushed to avoid accessing reference count
462 * addresses that have been freed. The call to xfs_filestream_flush() must be
463 * made inside the block that holds the m_peraglock in write mode to do the
464 * reallocation.
465 */
466void
467xfs_filestream_flush(
468 xfs_mount_t *mp)
469{
470 /* point in time flush, so keep the reaper running */
471 xfs_mru_cache_flush(mp->m_filestream, 1);
472}
473
474/*
475 * Return the AG of the filestream the file or directory belongs to, or
476 * NULLAGNUMBER otherwise.
477 */
478xfs_agnumber_t
479xfs_filestream_lookup_ag(
480 xfs_inode_t *ip)
481{
482 xfs_mru_cache_t *cache;
483 fstrm_item_t *item;
484 xfs_agnumber_t ag;
485 int ref;
486
487 if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) {
488 ASSERT(0);
489 return NULLAGNUMBER;
490 }
491
492 cache = ip->i_mount->m_filestream;
493 item = xfs_mru_cache_lookup(cache, ip->i_ino);
494 if (!item) {
495 TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0);
496 return NULLAGNUMBER;
497 }
498
499 ASSERT(ip == item->ip);
500 ag = item->ag;
501 ref = xfs_filestream_peek_ag(ip->i_mount, ag);
502 xfs_mru_cache_done(cache);
503
504 TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref);
505 return ag;
506}
507
508/*
509 * xfs_filestream_associate() should only be called to associate a regular file
510 * with its parent directory. Calling it with a child directory isn't
511 * appropriate because filestreams don't apply to entire directory hierarchies.
512 * Creating a file in a child directory of an existing filestream directory
513 * starts a new filestream with its own allocation group association.
514 *
515 * Returns < 0 on error, 0 if successful association occurred, > 0 if
516 * we failed to get an association because of locking issues.
517 */
518int
519xfs_filestream_associate(
520 xfs_inode_t *pip,
521 xfs_inode_t *ip)
522{
523 xfs_mount_t *mp;
524 xfs_mru_cache_t *cache;
525 fstrm_item_t *item;
526 xfs_agnumber_t ag, rotorstep, startag;
527 int err = 0;
528
529 ASSERT(pip->i_d.di_mode & S_IFDIR);
530 ASSERT(ip->i_d.di_mode & S_IFREG);
531 if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG))
532 return -EINVAL;
533
534 mp = pip->i_mount;
535 cache = mp->m_filestream;
536 down_read(&mp->m_peraglock);
537
538 /*
539 * We have a problem, Houston.
540 *
541 * Taking the iolock here violates inode locking order - we already
542 * hold the ilock. Hence if we block getting this lock we may never
543 * wake. Unfortunately, that means if we can't get the lock, we're
544 * screwed in terms of getting a stream association - we can't spin
545 * waiting for the lock because someone else is waiting on the lock we
546 * hold and we cannot drop that as we are in a transaction here.
547 *
548 * Lucky for us, this inversion is rarely a problem because it's a
549 * directory inode that we are trying to lock here and that means the
550 * only place that matters is xfs_sync_inodes() and SYNC_DELWRI is
551 * used. i.e. freeze, remount-ro, quotasync or unmount.
552 *
553 * So, if we can't get the iolock without sleeping then just give up
554 */
555 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) {
556 up_read(&mp->m_peraglock);
557 return 1;
558 }
559
560 /* If the parent directory is already in the cache, use its AG. */
561 item = xfs_mru_cache_lookup(cache, pip->i_ino);
562 if (item) {
563 ASSERT(item->ip == pip);
564 ag = item->ag;
565 xfs_mru_cache_done(cache);
566
567 TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
568 err = _xfs_filestream_update_ag(ip, pip, ag);
569
570 goto exit;
571 }
572
573 /*
574 * Set the starting AG using the rotor for inode32, otherwise
575 * use the directory inode's AG.
576 */
577 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
578 rotorstep = xfs_rotorstep;
579 startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
580 mp->m_agfrotor = (mp->m_agfrotor + 1) %
581 (mp->m_sb.sb_agcount * rotorstep);
582 } else
583 startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
584
585 /* Pick a new AG for the parent inode starting at startag. */
586 err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
587 if (err || ag == NULLAGNUMBER)
588 goto exit_did_pick;
589
590 /* Associate the parent inode with the AG. */
591 err = _xfs_filestream_update_ag(pip, NULL, ag);
592 if (err)
593 goto exit_did_pick;
594
595 /* Associate the file inode with the AG. */
596 err = _xfs_filestream_update_ag(ip, pip, ag);
597 if (err)
598 goto exit_did_pick;
599
600 TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));
601
602exit_did_pick:
603 /*
604 * If _xfs_filestream_pick_ag() returned a valid AG, remove the
605 * reference it took on it, since the file and directory will have taken
606 * their own now if they were successfully cached.
607 */
608 if (ag != NULLAGNUMBER)
609 xfs_filestream_put_ag(mp, ag);
610
611exit:
612 xfs_iunlock(pip, XFS_IOLOCK_EXCL);
613 up_read(&mp->m_peraglock);
614 return -err;
615}
616
617/*
618 * Pick a new allocation group for the current file and its file stream. This
619 * function is called by xfs_bmap_filestreams() with the mount point's per-ag
620 * lock held.
621 */
622int
623xfs_filestream_new_ag(
624 xfs_bmalloca_t *ap,
625 xfs_agnumber_t *agp)
626{
627 int flags, err;
628 xfs_inode_t *ip, *pip = NULL;
629 xfs_mount_t *mp;
630 xfs_mru_cache_t *cache;
631 xfs_extlen_t minlen;
632 fstrm_item_t *dir, *file;
633 xfs_agnumber_t ag = NULLAGNUMBER;
634
635 ip = ap->ip;
636 mp = ip->i_mount;
637 cache = mp->m_filestream;
638 minlen = ap->alen;
639 *agp = NULLAGNUMBER;
640
641 /*
642 * Look for the file in the cache, removing it if it's found. Doing
643 * this allows it to be held across the dir lookup that follows.
644 */
645 file = xfs_mru_cache_remove(cache, ip->i_ino);
646 if (file) {
647 ASSERT(ip == file->ip);
648
649 /* Save the file's parent inode and old AG number for later. */
650 pip = file->pip;
651 ag = file->ag;
652
653 /* Look for the file's directory in the cache. */
654 dir = xfs_mru_cache_lookup(cache, pip->i_ino);
655 if (dir) {
656 ASSERT(pip == dir->ip);
657
658 /*
659 * If the directory has already moved on to a new AG,
660 * use that AG as the new AG for the file. Don't
661 * forget to twiddle the AG refcounts to match the
662 * movement.
663 */
664 if (dir->ag != file->ag) {
665 xfs_filestream_put_ag(mp, file->ag);
666 xfs_filestream_get_ag(mp, dir->ag);
667 *agp = file->ag = dir->ag;
668 }
669
670 xfs_mru_cache_done(cache);
671 }
672
673 /*
674 * Put the file back in the cache. If this fails, the free
675 * function needs to be called to tidy up in the same way as if
676 * the item had simply expired from the cache.
677 */
678 err = xfs_mru_cache_insert(cache, ip->i_ino, file);
679 if (err) {
680 xfs_fstrm_free_func(ip->i_ino, file);
681 return err;
682 }
683
684 /*
685 * If the file's AG was moved to the directory's new AG, there's
686 * nothing more to be done.
687 */
688 if (*agp != NULLAGNUMBER) {
689 TRACE_MOVEAG(mp, ip, pip,
690 ag, xfs_filestream_peek_ag(mp, ag),
691 *agp, xfs_filestream_peek_ag(mp, *agp));
692 return 0;
693 }
694 }
695
696 /*
697 * If the file's parent directory is known, take its iolock in exclusive
698 * mode to prevent two sibling files from racing each other to migrate
699 * themselves and their parent to different AGs.
700 */
701 if (pip)
702 xfs_ilock(pip, XFS_IOLOCK_EXCL);
703
704 /*
705 * A new AG needs to be found for the file. If the file's parent
706 * directory is also known, it will be moved to the new AG as well to
707 * ensure that files created inside it in future use the new AG.
708 */
709 ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
710 flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
711 (ap->low ? XFS_PICK_LOWSPACE : 0);
712
713 err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
714 if (err || *agp == NULLAGNUMBER)
715 goto exit;
716
717 /*
718 * If the file wasn't found in the file cache, then its parent directory
719 * inode isn't known. For this to have happened, the file must either
720 * be pre-existing, or it was created long enough ago that its cache
721 * entry has expired. This isn't the sort of usage that the filestreams
722 * allocator is trying to optimise, so there's no point trying to track
723 * its new AG somehow in the filestream data structures.
724 */
725 if (!pip) {
726 TRACE_ORPHAN(mp, ip, *agp);
727 goto exit;
728 }
729
730 /* Associate the parent inode with the AG. */
731 err = _xfs_filestream_update_ag(pip, NULL, *agp);
732 if (err)
733 goto exit;
734
735 /* Associate the file inode with the AG. */
736 err = _xfs_filestream_update_ag(ip, pip, *agp);
737 if (err)
738 goto exit;
739
740 TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
741 *agp, xfs_filestream_peek_ag(mp, *agp));
742
743exit:
744 /*
745 * If _xfs_filestream_pick_ag() returned a valid AG, remove the
746 * reference it took on it, since the file and directory will have taken
747 * their own now if they were successfully cached.
748 */
749 if (*agp != NULLAGNUMBER)
750 xfs_filestream_put_ag(mp, *agp);
751 else
752 *agp = 0;
753
754 if (pip)
755 xfs_iunlock(pip, XFS_IOLOCK_EXCL);
756
757 return err;
758}
759
760/*
761 * Remove an association between an inode and a filestream object.
762 * Typically this is done on last close of an unlinked file.
763 */
764void
765xfs_filestream_deassociate(
766 xfs_inode_t *ip)
767{
768 xfs_mru_cache_t *cache = ip->i_mount->m_filestream;
769
770 xfs_mru_cache_delete(cache, ip->i_ino);
771}
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
new file mode 100644
index 000000000000..f655f7dc334c
--- /dev/null
+++ b/fs/xfs/xfs_filestream.h
@@ -0,0 +1,136 @@
1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_FILESTREAM_H__
19#define __XFS_FILESTREAM_H__
20
21#ifdef __KERNEL__
22
23struct xfs_mount;
24struct xfs_inode;
25struct xfs_perag;
26struct xfs_bmalloca;
27
28#ifdef XFS_FILESTREAMS_TRACE
29#define XFS_FSTRM_KTRACE_INFO 1
30#define XFS_FSTRM_KTRACE_AGSCAN 2
31#define XFS_FSTRM_KTRACE_AGPICK1 3
32#define XFS_FSTRM_KTRACE_AGPICK2 4
33#define XFS_FSTRM_KTRACE_UPDATE 5
34#define XFS_FSTRM_KTRACE_FREE 6
35#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7
36#define XFS_FSTRM_KTRACE_ASSOCIATE 8
37#define XFS_FSTRM_KTRACE_MOVEAG 9
38#define XFS_FSTRM_KTRACE_ORPHAN 10
39
40#define XFS_FSTRM_KTRACE_SIZE 16384
41extern ktrace_t *xfs_filestreams_trace_buf;
42
43#endif
44
45/*
46 * Allocation group filestream associations are tracked with per-ag atomic
47 * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a
48 * particular AG already has active filestreams associated with it. The mount
49 * point's m_peraglock is used to protect these counters from per-ag array
50 * re-allocation during a growfs operation. When xfs_growfs_data_private() is
51 * about to reallocate the array, it calls xfs_filestream_flush() with the
52 * m_peraglock held in write mode.
53 *
54 * Since xfs_mru_cache_flush() guarantees that all the free functions for all
55 * the cache elements have finished executing before it returns, it's safe for
56 * the free functions to use the atomic counters without m_peraglock protection.
57 * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
58 * whether it was called with the m_peraglock held in read mode, write mode or
59 * not held at all. The race condition this addresses is the following:
60 *
61 * - The work queue scheduler fires and pulls a filestream directory cache
62 * element off the LRU end of the cache for deletion, then gets pre-empted.
63 * - A growfs operation grabs the m_peraglock in write mode, flushes all the
64 * remaining items from the cache and reallocates the mount point's per-ag
65 * array, resetting all the counters to zero.
66 * - The work queue thread resumes and calls the free function for the element
67 * it started cleaning up earlier. In the process it decrements the
68 * filestreams counter for an AG that now has no references.
69 *
70 * With a shrinkfs feature, the above scenario could panic the system.
71 *
72 * All other uses of the following macros should be protected by either the
73 * m_peraglock held in read mode, or the cache's internal locking exposed by the
74 * interval between a call to xfs_mru_cache_lookup() and a call to
75 * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
76 * when new elements are added to the cache.
77 *
78 * Combined, these locking rules ensure that no associations will ever exist in
79 * the cache that reference per-ag array elements that have since been
80 * reallocated.
81 */
82STATIC_INLINE int
83xfs_filestream_peek_ag(
84 xfs_mount_t *mp,
85 xfs_agnumber_t agno)
86{
87 return atomic_read(&mp->m_perag[agno].pagf_fstrms);
88}
89
90STATIC_INLINE int
91xfs_filestream_get_ag(
92 xfs_mount_t *mp,
93 xfs_agnumber_t agno)
94{
95 return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms);
96}
97
98STATIC_INLINE int
99xfs_filestream_put_ag(
100 xfs_mount_t *mp,
101 xfs_agnumber_t agno)
102{
103 return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms);
104}
105
106/* allocation selection flags */
107typedef enum xfs_fstrm_alloc {
108 XFS_PICK_USERDATA = 1,
109 XFS_PICK_LOWSPACE = 2,
110} xfs_fstrm_alloc_t;
111
112/* prototypes for filestream.c */
113int xfs_filestream_init(void);
114void xfs_filestream_uninit(void);
115int xfs_filestream_mount(struct xfs_mount *mp);
116void xfs_filestream_unmount(struct xfs_mount *mp);
117void xfs_filestream_flush(struct xfs_mount *mp);
118xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
119int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
120void xfs_filestream_deassociate(struct xfs_inode *ip);
121int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
122
123
124/* filestreams for the inode? */
125STATIC_INLINE int
126xfs_inode_is_filestream(
127 struct xfs_inode *ip)
128{
129 return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
130 xfs_iflags_test(ip, XFS_IFILESTREAM) ||
131 (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
132}
133
134#endif /* __KERNEL__ */
135
136#endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 1b60cfc28be5..ec3c9c27e0de 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -66,6 +66,7 @@ struct fsxattr {
66#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ 66#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
67#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ 67#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
68#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ 68#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
69#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
69#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ 70#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
70 71
71/* 72/*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 2251a49f3e17..432e82347ed6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -44,6 +44,7 @@
44#include "xfs_trans_space.h" 44#include "xfs_trans_space.h"
45#include "xfs_rtalloc.h" 45#include "xfs_rtalloc.h"
46#include "xfs_rw.h" 46#include "xfs_rw.h"
47#include "xfs_filestream.h"
47 48
48/* 49/*
49 * File system operations 50 * File system operations
@@ -165,6 +166,7 @@ xfs_growfs_data_private(
165 new = nb - mp->m_sb.sb_dblocks; 166 new = nb - mp->m_sb.sb_dblocks;
166 oagcount = mp->m_sb.sb_agcount; 167 oagcount = mp->m_sb.sb_agcount;
167 if (nagcount > oagcount) { 168 if (nagcount > oagcount) {
169 xfs_filestream_flush(mp);
168 down_write(&mp->m_peraglock); 170 down_write(&mp->m_peraglock);
169 mp->m_perag = kmem_realloc(mp->m_perag, 171 mp->m_perag = kmem_realloc(mp->m_perag,
170 sizeof(xfs_perag_t) * nagcount, 172 sizeof(xfs_perag_t) * nagcount,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8fdd30d9ba56..2ef100be6c4f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -48,6 +48,7 @@
48#include "xfs_dir2_trace.h" 48#include "xfs_dir2_trace.h"
49#include "xfs_quota.h" 49#include "xfs_quota.h"
50#include "xfs_acl.h" 50#include "xfs_acl.h"
51#include "xfs_filestream.h"
51 52
52#include <linux/log2.h> 53#include <linux/log2.h>
53 54
@@ -818,6 +819,8 @@ _xfs_dic2xflags(
818 flags |= XFS_XFLAG_EXTSZINHERIT; 819 flags |= XFS_XFLAG_EXTSZINHERIT;
819 if (di_flags & XFS_DIFLAG_NODEFRAG) 820 if (di_flags & XFS_DIFLAG_NODEFRAG)
820 flags |= XFS_XFLAG_NODEFRAG; 821 flags |= XFS_XFLAG_NODEFRAG;
822 if (di_flags & XFS_DIFLAG_FILESTREAM)
823 flags |= XFS_XFLAG_FILESTREAM;
821 } 824 }
822 825
823 return flags; 826 return flags;
@@ -1151,7 +1154,7 @@ xfs_ialloc(
1151 /* 1154 /*
1152 * Project ids won't be stored on disk if we are using a version 1 inode. 1155 * Project ids won't be stored on disk if we are using a version 1 inode.
1153 */ 1156 */
1154 if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) 1157 if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
1155 xfs_bump_ino_vers2(tp, ip); 1158 xfs_bump_ino_vers2(tp, ip);
1156 1159
1157 if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { 1160 if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
@@ -1196,8 +1199,16 @@ xfs_ialloc(
1196 flags |= XFS_ILOG_DEV; 1199 flags |= XFS_ILOG_DEV;
1197 break; 1200 break;
1198 case S_IFREG: 1201 case S_IFREG:
1202 if (xfs_inode_is_filestream(pip)) {
1203 error = xfs_filestream_associate(pip, ip);
1204 if (error < 0)
1205 return -error;
1206 if (!error)
1207 xfs_iflags_set(ip, XFS_IFILESTREAM);
1208 }
1209 /* fall through */
1199 case S_IFDIR: 1210 case S_IFDIR:
1200 if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1211 if (pip->i_d.di_flags & XFS_DIFLAG_ANY) {
1201 uint di_flags = 0; 1212 uint di_flags = 0;
1202 1213
1203 if ((mode & S_IFMT) == S_IFDIR) { 1214 if ((mode & S_IFMT) == S_IFDIR) {
@@ -1234,6 +1245,8 @@ xfs_ialloc(
1234 if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && 1245 if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
1235 xfs_inherit_nodefrag) 1246 xfs_inherit_nodefrag)
1236 di_flags |= XFS_DIFLAG_NODEFRAG; 1247 di_flags |= XFS_DIFLAG_NODEFRAG;
1248 if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
1249 di_flags |= XFS_DIFLAG_FILESTREAM;
1237 ip->i_d.di_flags |= di_flags; 1250 ip->i_d.di_flags |= di_flags;
1238 } 1251 }
1239 /* FALLTHROUGH */ 1252 /* FALLTHROUGH */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f75afecef8e7..d418eeed4ebd 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -379,6 +379,7 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
379#define XFS_ISTALE 0x0010 /* inode has been staled */ 379#define XFS_ISTALE 0x0010 /* inode has been staled */
380#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ 380#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */
381#define XFS_INEW 0x0040 381#define XFS_INEW 0x0040
382#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */
382 383
383/* 384/*
384 * Flags for inode locking. 385 * Flags for inode locking.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 0bca2d422719..76ad74758696 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,6 +66,7 @@ struct xfs_bmbt_irec;
66struct xfs_bmap_free; 66struct xfs_bmap_free;
67struct xfs_extdelta; 67struct xfs_extdelta;
68struct xfs_swapext; 68struct xfs_swapext;
69struct xfs_mru_cache;
69 70
70extern struct bhv_vfsops xfs_vfsops; 71extern struct bhv_vfsops xfs_vfsops;
71extern struct bhv_vnodeops xfs_vnodeops; 72extern struct bhv_vnodeops xfs_vnodeops;
@@ -424,6 +425,7 @@ typedef struct xfs_mount {
424 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ 425 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
425 struct mutex m_icsb_mutex; /* balancer sync lock */ 426 struct mutex m_icsb_mutex; /* balancer sync lock */
426#endif 427#endif
428 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
427} xfs_mount_t; 429} xfs_mount_t;
428 430
429/* 431/*
@@ -463,6 +465,8 @@ typedef struct xfs_mount {
463 * I/O size in stat() */ 465 * I/O size in stat() */
464#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock 466#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock
465 counters */ 467 counters */
468#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
469 allocator */
466 470
467 471
468/* 472/*
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
new file mode 100644
index 000000000000..7deb9e3cbbd3
--- /dev/null
+++ b/fs/xfs/xfs_mru_cache.c
@@ -0,0 +1,608 @@
1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_mru_cache.h"
20
21/*
22 * The MRU Cache data structure consists of a data store, an array of lists and
23 * a lock to protect its internal state. At initialisation time, the client
24 * supplies an element lifetime in milliseconds and a group count, as well as a
25 * function pointer to call when deleting elements. A data structure for
26 * queueing up work in the form of timed callbacks is also included.
27 *
28 * The group count controls how many lists are created, and thereby how finely
29 * the elements are grouped in time. When reaping occurs, all the elements in
30 * all the lists whose time has expired are deleted.
31 *
32 * To give an example of how this works in practice, consider a client that
33 * initialises an MRU Cache with a lifetime of ten seconds and a group count of
34 * five. Five internal lists will be created, each representing a two second
35 * period in time. When the first element is added, time zero for the data
36 * structure is initialised to the current time.
37 *
38 * All the elements added in the first two seconds are appended to the first
39 * list. Elements added in the third second go into the second list, and so on.
40 * If an element is accessed at any point, it is removed from its list and
41 * inserted at the head of the current most-recently-used list.
42 *
43 * The reaper function will have nothing to do until at least twelve seconds
44 * have elapsed since the first element was added. The reason for this is that
45 * if it were called at t=11s, there could be elements in the first list that
46 * have only been inactive for nine seconds, so it still does nothing. If it is
47 * called anywhere between t=12 and t=14 seconds, it will delete all the
48 * elements that remain in the first list. It's therefore possible for elements
49 * to remain in the data store even after they've been inactive for up to
50 * (t + t/g) seconds, where t is the inactive element lifetime and g is the
51 * number of groups.
52 *
53 * The above example assumes that the reaper function gets called at least once
54 * every (t/g) seconds. If it is called less frequently, unused elements will
55 * accumulate in the reap list until the reaper function is eventually called.
56 * The current implementation uses work queue callbacks to carefully time the
57 * reaper function calls, so this should happen rarely, if at all.
58 *
59 * From a design perspective, the primary reason for the choice of a list array
60 * representing discrete time intervals is that it's only practical to reap
61 * expired elements in groups of some appreciable size. This automatically
62 * introduces a granularity to element lifetimes, so there's no point storing an
63 * individual timeout with each element that specifies a more precise reap time.
64 * The bonus is a saving of sizeof(long) bytes of memory per element stored.
65 *
66 * The elements could have been stored in just one list, but an array of
67 * counters or pointers would need to be maintained to allow them to be divided
68 * up into discrete time groups. More critically, the process of touching or
69 * removing an element would involve walking large portions of the entire list,
70 * which would have a detrimental effect on performance. The additional memory
71 * requirement for the array of list heads is minimal.
72 *
73 * When an element is touched or deleted, it needs to be removed from its
74 * current list. Doubly linked lists are used to make the list maintenance
75 * portion of these operations O(1). Since reaper timing can be imprecise,
76 * inserts and lookups can occur when there are no free lists available. When
77 * this happens, all the elements on the LRU list need to be migrated to the end
78 * of the reap list. To keep the list maintenance portion of these operations
79 * O(1) also, list tails need to be accessible without walking the entire list.
80 * This is the reason why doubly linked list heads are used.
81 */
82
83/*
84 * An MRU Cache is a dynamic data structure that stores its elements in a way
85 * that allows efficient lookups, but also groups them into discrete time
86 * intervals based on insertion time. This allows elements to be efficiently
87 * and automatically reaped after a fixed period of inactivity.
88 *
89 * When a client data pointer is stored in the MRU Cache it needs to be added to
90 * both the data store and to one of the lists. It must also be possible to
91 * access each of these entries via the other, i.e. to:
92 *
93 * a) Walk a list, removing the corresponding data store entry for each item.
94 * b) Look up a data store entry, then access its list entry directly.
95 *
96 * To achieve both of these goals, each entry must contain both a list entry and
97 * a key, in addition to the user's data pointer. Note that it's not a good
98 * idea to have the client embed one of these structures at the top of their own
99 * data structure, because inserting the same item more than once would most
100 * likely result in a loop in one of the lists. That's a sure-fire recipe for
101 * an infinite loop in the code.
102 */
103typedef struct xfs_mru_cache_elem
104{
105 struct list_head list_node;
106 unsigned long key;
107 void *value;
108} xfs_mru_cache_elem_t;
109
110static kmem_zone_t *xfs_mru_elem_zone;
111static struct workqueue_struct *xfs_mru_reap_wq;
112
113/*
114 * When inserting, destroying or reaping, it's first necessary to update the
115 * lists relative to a particular time. In the case of destroying, that time
116 * will be well in the future to ensure that all items are moved to the reap
117 * list. In all other cases though, the time will be the current time.
118 *
119 * This function enters a loop, moving the contents of the LRU list to the reap
120 * list again and again until either a) the lists are all empty, or b) time zero
121 * has been advanced sufficiently to be within the immediate element lifetime.
122 *
123 * Case a) above is detected by counting how many groups are migrated and
124 * stopping when they've all been moved. Case b) is detected by monitoring the
125 * time_zero field, which is updated as each group is migrated.
126 *
127 * The return value is the earliest time that more migration could be needed, or
128 * zero if there's no need to schedule more work because the lists are empty.
129 */
130STATIC unsigned long
131_xfs_mru_cache_migrate(
132 xfs_mru_cache_t *mru,
133 unsigned long now)
134{
135 unsigned int grp;
136 unsigned int migrated = 0;
137 struct list_head *lru_list;
138
139 /* Nothing to do if the data store is empty. */
140 if (!mru->time_zero)
141 return 0;
142
143 /* While time zero is older than the time spanned by all the lists. */
144 while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {
145
146 /*
147 * If the LRU list isn't empty, migrate its elements to the tail
148 * of the reap list.
149 */
150 lru_list = mru->lists + mru->lru_grp;
151 if (!list_empty(lru_list))
152 list_splice_init(lru_list, mru->reap_list.prev);
153
154 /*
155 * Advance the LRU group number, freeing the old LRU list to
156 * become the new MRU list; advance time zero accordingly.
157 */
158 mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
159 mru->time_zero += mru->grp_time;
160
161 /*
162 * If reaping is so far behind that all the elements on all the
163 * lists have been migrated to the reap list, it's now empty.
164 */
165 if (++migrated == mru->grp_count) {
166 mru->lru_grp = 0;
167 mru->time_zero = 0;
168 return 0;
169 }
170 }
171
172 /* Find the first non-empty list from the LRU end. */
173 for (grp = 0; grp < mru->grp_count; grp++) {
174
175 /* Check the grp'th list from the LRU end. */
176 lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
177 if (!list_empty(lru_list))
178 return mru->time_zero +
179 (mru->grp_count + grp) * mru->grp_time;
180 }
181
182 /* All the lists must be empty. */
183 mru->lru_grp = 0;
184 mru->time_zero = 0;
185 return 0;
186}
187
188/*
189 * When inserting or doing a lookup, an element needs to be inserted into the
190 * MRU list. The lists must be migrated first to ensure that they're
191 * up-to-date, otherwise the new element could be given a shorter lifetime in
192 * the cache than it should.
193 */
194STATIC void
195_xfs_mru_cache_list_insert(
196 xfs_mru_cache_t *mru,
197 xfs_mru_cache_elem_t *elem)
198{
199 unsigned int grp = 0;
200 unsigned long now = jiffies;
201
202 /*
203 * If the data store is empty, initialise time zero, leave grp set to
204 * zero and start the work queue timer if necessary. Otherwise, set grp
205 * to the number of group times that have elapsed since time zero.
206 */
207 if (!_xfs_mru_cache_migrate(mru, now)) {
208 mru->time_zero = now;
209 if (!mru->next_reap)
210 mru->next_reap = mru->grp_count * mru->grp_time;
211 } else {
212 grp = (now - mru->time_zero) / mru->grp_time;
213 grp = (mru->lru_grp + grp) % mru->grp_count;
214 }
215
216 /* Insert the element at the tail of the corresponding list. */
217 list_add_tail(&elem->list_node, mru->lists + grp);
218}
219
220/*
221 * When destroying or reaping, all the elements that were migrated to the reap
222 * list need to be deleted. For each element this involves removing it from the
223 * data store, removing it from the reap list, calling the client's free
224 * function and deleting the element from the element zone.
225 */
226STATIC void
227_xfs_mru_cache_clear_reap_list(
228 xfs_mru_cache_t *mru)
229{
230 xfs_mru_cache_elem_t *elem, *next;
231 struct list_head tmp;
232
233 INIT_LIST_HEAD(&tmp);
234 list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {
235
236 /* Remove the element from the data store. */
237 radix_tree_delete(&mru->store, elem->key);
238
239 /*
240 * remove to temp list so it can be freed without
241 * needing to hold the lock
242 */
243 list_move(&elem->list_node, &tmp);
244 }
245 mutex_spinunlock(&mru->lock, 0);
246
247 list_for_each_entry_safe(elem, next, &tmp, list_node) {
248
249 /* Remove the element from the reap list. */
250 list_del_init(&elem->list_node);
251
252 /* Call the client's free function with the key and value pointer. */
253 mru->free_func(elem->key, elem->value);
254
255 /* Free the element structure. */
256 kmem_zone_free(xfs_mru_elem_zone, elem);
257 }
258
259 mutex_spinlock(&mru->lock);
260}
261
262/*
263 * We fire the reap timer every group expiry interval so
264 * we always have a reaper ready to run. This makes shutdown
265 * and flushing of the reaper easy to do. Hence we need to
266 * keep when the next reap must occur so we can determine
267 * at each interval whether there is anything we need to do.
268 */
269STATIC void
270_xfs_mru_cache_reap(
271 struct work_struct *work)
272{
273 xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work);
274 unsigned long now;
275
276 ASSERT(mru && mru->lists);
277 if (!mru || !mru->lists)
278 return;
279
280 mutex_spinlock(&mru->lock);
281 now = jiffies;
282 if (mru->reap_all ||
283 (mru->next_reap && time_after(now, mru->next_reap))) {
284 if (mru->reap_all)
285 now += mru->grp_count * mru->grp_time * 2;
286 mru->next_reap = _xfs_mru_cache_migrate(mru, now);
287 _xfs_mru_cache_clear_reap_list(mru);
288 }
289
290 /*
291 * the process that triggered the reap_all is responsible
292 * for restating the periodic reap if it is required.
293 */
294 if (!mru->reap_all)
295 queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
296 mru->reap_all = 0;
297 mutex_spinunlock(&mru->lock, 0);
298}
299
300int
301xfs_mru_cache_init(void)
302{
303 xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t),
304 "xfs_mru_cache_elem");
305 if (!xfs_mru_elem_zone)
306 return ENOMEM;
307
308 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache");
309 if (!xfs_mru_reap_wq) {
310 kmem_zone_destroy(xfs_mru_elem_zone);
311 return ENOMEM;
312 }
313
314 return 0;
315}
316
317void
318xfs_mru_cache_uninit(void)
319{
320 destroy_workqueue(xfs_mru_reap_wq);
321 kmem_zone_destroy(xfs_mru_elem_zone);
322}
323
324/*
325 * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create()
326 * with the address of the pointer, a lifetime value in milliseconds, a group
327 * count and a free function to use when deleting elements. This function
328 * returns 0 if the initialisation was successful.
329 */
330int
331xfs_mru_cache_create(
332 xfs_mru_cache_t **mrup,
333 unsigned int lifetime_ms,
334 unsigned int grp_count,
335 xfs_mru_cache_free_func_t free_func)
336{
337 xfs_mru_cache_t *mru = NULL;
338 int err = 0, grp;
339 unsigned int grp_time;
340
341 if (mrup)
342 *mrup = NULL;
343
344 if (!mrup || !grp_count || !lifetime_ms || !free_func)
345 return EINVAL;
346
347 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
348 return EINVAL;
349
350 if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
351 return ENOMEM;
352
353 /* An extra list is needed to avoid reaping up to a grp_time early. */
354 mru->grp_count = grp_count + 1;
355 mru->lists = kmem_alloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
356
357 if (!mru->lists) {
358 err = ENOMEM;
359 goto exit;
360 }
361
362 for (grp = 0; grp < mru->grp_count; grp++)
363 INIT_LIST_HEAD(mru->lists + grp);
364
365 /*
366 * We use GFP_KERNEL radix tree preload and do inserts under a
367 * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
368 */
369 INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
370 INIT_LIST_HEAD(&mru->reap_list);
371 spinlock_init(&mru->lock, "xfs_mru_cache");
372 INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);
373
374 mru->grp_time = grp_time;
375 mru->free_func = free_func;
376
377 /* start up the reaper event */
378 mru->next_reap = 0;
379 mru->reap_all = 0;
380 queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
381
382 *mrup = mru;
383
384exit:
385 if (err && mru && mru->lists)
386 kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
387 if (err && mru)
388 kmem_free(mru, sizeof(*mru));
389
390 return err;
391}
392
393/*
394 * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
395 * free functions as they're deleted. When this function returns, the caller is
396 * guaranteed that all the free functions for all the elements have finished
397 * executing.
398 *
399 * While we are flushing, we stop the periodic reaper event from triggering.
400 * Normally, we want to restart this periodic event, but if we are shutting
401 * down the cache we do not want it restarted. hence the restart parameter
402 * where 0 = do not restart reaper and 1 = restart reaper.
403 */
404void
405xfs_mru_cache_flush(
406 xfs_mru_cache_t *mru,
407 int restart)
408{
409 if (!mru || !mru->lists)
410 return;
411
412 cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
413
414 mutex_spinlock(&mru->lock);
415 mru->reap_all = 1;
416 mutex_spinunlock(&mru->lock, 0);
417
418 queue_work(xfs_mru_reap_wq, &mru->work.work);
419 flush_workqueue(xfs_mru_reap_wq);
420
421 mutex_spinlock(&mru->lock);
422 WARN_ON_ONCE(mru->reap_all != 0);
423 mru->reap_all = 0;
424 if (restart)
425 queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
426 mutex_spinunlock(&mru->lock, 0);
427}
428
429void
430xfs_mru_cache_destroy(
431 xfs_mru_cache_t *mru)
432{
433 if (!mru || !mru->lists)
434 return;
435
436 /* we don't want the reaper to restart here */
437 xfs_mru_cache_flush(mru, 0);
438
439 kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
440 kmem_free(mru, sizeof(*mru));
441}
442
443/*
444 * To insert an element, call xfs_mru_cache_insert() with the data store, the
445 * element's key and the client data pointer. This function returns 0 on
446 * success or ENOMEM if memory for the data element couldn't be allocated.
447 */
448int
449xfs_mru_cache_insert(
450 xfs_mru_cache_t *mru,
451 unsigned long key,
452 void *value)
453{
454 xfs_mru_cache_elem_t *elem;
455
456 ASSERT(mru && mru->lists);
457 if (!mru || !mru->lists)
458 return EINVAL;
459
460 elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP);
461 if (!elem)
462 return ENOMEM;
463
464 if (radix_tree_preload(GFP_KERNEL)) {
465 kmem_zone_free(xfs_mru_elem_zone, elem);
466 return ENOMEM;
467 }
468
469 INIT_LIST_HEAD(&elem->list_node);
470 elem->key = key;
471 elem->value = value;
472
473 mutex_spinlock(&mru->lock);
474
475 radix_tree_insert(&mru->store, key, elem);
476 radix_tree_preload_end();
477 _xfs_mru_cache_list_insert(mru, elem);
478
479 mutex_spinunlock(&mru->lock, 0);
480
481 return 0;
482}
483
484/*
485 * To remove an element without calling the free function, call
486 * xfs_mru_cache_remove() with the data store and the element's key. On success
487 * the client data pointer for the removed element is returned, otherwise this
488 * function will return a NULL pointer.
489 */
490void *
491xfs_mru_cache_remove(
492 xfs_mru_cache_t *mru,
493 unsigned long key)
494{
495 xfs_mru_cache_elem_t *elem;
496 void *value = NULL;
497
498 ASSERT(mru && mru->lists);
499 if (!mru || !mru->lists)
500 return NULL;
501
502 mutex_spinlock(&mru->lock);
503 elem = radix_tree_delete(&mru->store, key);
504 if (elem) {
505 value = elem->value;
506 list_del(&elem->list_node);
507 }
508
509 mutex_spinunlock(&mru->lock, 0);
510
511 if (elem)
512 kmem_zone_free(xfs_mru_elem_zone, elem);
513
514 return value;
515}
516
517/*
518 * To remove and element and call the free function, call xfs_mru_cache_delete()
519 * with the data store and the element's key.
520 */
521void
522xfs_mru_cache_delete(
523 xfs_mru_cache_t *mru,
524 unsigned long key)
525{
526 void *value = xfs_mru_cache_remove(mru, key);
527
528 if (value)
529 mru->free_func(key, value);
530}
531
532/*
533 * To look up an element using its key, call xfs_mru_cache_lookup() with the
534 * data store and the element's key. If found, the element will be moved to the
535 * head of the MRU list to indicate that it's been touched.
536 *
537 * The internal data structures are protected by a spinlock that is STILL HELD
538 * when this function returns. Call xfs_mru_cache_done() to release it. Note
539 * that it is not safe to call any function that might sleep in the interim.
540 *
541 * The implementation could have used reference counting to avoid this
542 * restriction, but since most clients simply want to get, set or test a member
543 * of the returned data structure, the extra per-element memory isn't warranted.
544 *
545 * If the element isn't found, this function returns NULL and the spinlock is
546 * released. xfs_mru_cache_done() should NOT be called when this occurs.
547 */
548void *
549xfs_mru_cache_lookup(
550 xfs_mru_cache_t *mru,
551 unsigned long key)
552{
553 xfs_mru_cache_elem_t *elem;
554
555 ASSERT(mru && mru->lists);
556 if (!mru || !mru->lists)
557 return NULL;
558
559 mutex_spinlock(&mru->lock);
560 elem = radix_tree_lookup(&mru->store, key);
561 if (elem) {
562 list_del(&elem->list_node);
563 _xfs_mru_cache_list_insert(mru, elem);
564 }
565 else
566 mutex_spinunlock(&mru->lock, 0);
567
568 return elem ? elem->value : NULL;
569}
570
571/*
572 * To look up an element using its key, but leave its location in the internal
573 * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this
574 * function returns NULL.
575 *
576 * See the comments above the declaration of the xfs_mru_cache_lookup() function
577 * for important locking information pertaining to this call.
578 */
579void *
580xfs_mru_cache_peek(
581 xfs_mru_cache_t *mru,
582 unsigned long key)
583{
584 xfs_mru_cache_elem_t *elem;
585
586 ASSERT(mru && mru->lists);
587 if (!mru || !mru->lists)
588 return NULL;
589
590 mutex_spinlock(&mru->lock);
591 elem = radix_tree_lookup(&mru->store, key);
592 if (!elem)
593 mutex_spinunlock(&mru->lock, 0);
594
595 return elem ? elem->value : NULL;
596}
597
598/*
599 * To release the internal data structure spinlock after having performed an
600 * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
601 * with the data store pointer.
602 */
603void
604xfs_mru_cache_done(
605 xfs_mru_cache_t *mru)
606{
607 mutex_spinunlock(&mru->lock, 0);
608}
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
new file mode 100644
index 000000000000..624fd10ee8e5
--- /dev/null
+++ b/fs/xfs/xfs_mru_cache.h
@@ -0,0 +1,57 @@
1/*
2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_MRU_CACHE_H__
19#define __XFS_MRU_CACHE_H__
20
21
22/* Function pointer type for callback to free a client's data pointer. */
23typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*);
24
25typedef struct xfs_mru_cache
26{
27 struct radix_tree_root store; /* Core storage data structure. */
28 struct list_head *lists; /* Array of lists, one per grp. */
29 struct list_head reap_list; /* Elements overdue for reaping. */
30 spinlock_t lock; /* Lock to protect this struct. */
31 unsigned int grp_count; /* Number of discrete groups. */
32 unsigned int grp_time; /* Time period spanned by grps. */
33 unsigned int lru_grp; /* Group containing time zero. */
34 unsigned long time_zero; /* Time first element was added. */
35 unsigned long next_reap; /* Time that the reaper should
36 next do something. */
37 unsigned int reap_all; /* if set, reap all lists */
38 xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
39 struct delayed_work work; /* Workqueue data for reaping. */
40} xfs_mru_cache_t;
41
42int xfs_mru_cache_init(void);
43void xfs_mru_cache_uninit(void);
44int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
45 unsigned int grp_count,
46 xfs_mru_cache_free_func_t free_func);
47void xfs_mru_cache_flush(xfs_mru_cache_t *mru, int restart);
48void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
49int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
50 void *value);
51void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
52void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
53void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
54void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
55void xfs_mru_cache_done(struct xfs_mru_cache *mru);
56
57#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index c343fde10ef9..11f5ea29a038 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -51,6 +51,8 @@
51#include "xfs_acl.h" 51#include "xfs_acl.h"
52#include "xfs_attr.h" 52#include "xfs_attr.h"
53#include "xfs_clnt.h" 53#include "xfs_clnt.h"
54#include "xfs_mru_cache.h"
55#include "xfs_filestream.h"
54#include "xfs_fsops.h" 56#include "xfs_fsops.h"
55 57
56STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); 58STATIC int xfs_sync(bhv_desc_t *, int, cred_t *);
@@ -81,6 +83,8 @@ xfs_init(void)
81 xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); 83 xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
82 xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); 84 xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
83 xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); 85 xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
86 xfs_mru_cache_init();
87 xfs_filestream_init();
84 88
85 /* 89 /*
86 * The size of the zone allocated buf log item is the maximum 90 * The size of the zone allocated buf log item is the maximum
@@ -164,6 +168,8 @@ xfs_cleanup(void)
164 xfs_cleanup_procfs(); 168 xfs_cleanup_procfs();
165 xfs_sysctl_unregister(); 169 xfs_sysctl_unregister();
166 xfs_refcache_destroy(); 170 xfs_refcache_destroy();
171 xfs_filestream_uninit();
172 xfs_mru_cache_uninit();
167 xfs_acl_zone_destroy(xfs_acl_zone); 173 xfs_acl_zone_destroy(xfs_acl_zone);
168 174
169#ifdef XFS_DIR2_TRACE 175#ifdef XFS_DIR2_TRACE
@@ -320,6 +326,9 @@ xfs_start_flags(
320 else 326 else
321 mp->m_flags &= ~XFS_MOUNT_BARRIER; 327 mp->m_flags &= ~XFS_MOUNT_BARRIER;
322 328
329 if (ap->flags2 & XFSMNT2_FILESTREAMS)
330 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
331
323 return 0; 332 return 0;
324} 333}
325 334
@@ -518,6 +527,9 @@ xfs_mount(
518 if (mp->m_flags & XFS_MOUNT_BARRIER) 527 if (mp->m_flags & XFS_MOUNT_BARRIER)
519 xfs_mountfs_check_barriers(mp); 528 xfs_mountfs_check_barriers(mp);
520 529
530 if ((error = xfs_filestream_mount(mp)))
531 goto error2;
532
521 error = XFS_IOINIT(vfsp, args, flags); 533 error = XFS_IOINIT(vfsp, args, flags);
522 if (error) 534 if (error)
523 goto error2; 535 goto error2;
@@ -575,6 +587,13 @@ xfs_unmount(
575 */ 587 */
576 xfs_refcache_purge_mp(mp); 588 xfs_refcache_purge_mp(mp);
577 589
590 /*
591 * Blow away any referenced inode in the filestreams cache.
592 * This can and will cause log traffic as inodes go inactive
593 * here.
594 */
595 xfs_filestream_unmount(mp);
596
578 XFS_bflush(mp->m_ddev_targp); 597 XFS_bflush(mp->m_ddev_targp);
579 error = xfs_unmount_flush(mp, 0); 598 error = xfs_unmount_flush(mp, 0);
580 if (error) 599 if (error)
@@ -694,6 +713,7 @@ xfs_mntupdate(
694 mp->m_flags &= ~XFS_MOUNT_BARRIER; 713 mp->m_flags &= ~XFS_MOUNT_BARRIER;
695 } 714 }
696 } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ 715 } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */
716 xfs_filestream_flush(mp);
697 bhv_vfs_sync(vfsp, SYNC_DATA_QUIESCE, NULL); 717 bhv_vfs_sync(vfsp, SYNC_DATA_QUIESCE, NULL);
698 xfs_attr_quiesce(mp); 718 xfs_attr_quiesce(mp);
699 vfsp->vfs_flag |= VFS_RDONLY; 719 vfsp->vfs_flag |= VFS_RDONLY;
@@ -909,6 +929,9 @@ xfs_sync(
909{ 929{
910 xfs_mount_t *mp = XFS_BHVTOM(bdp); 930 xfs_mount_t *mp = XFS_BHVTOM(bdp);
911 931
932 if (flags & SYNC_IOWAIT)
933 xfs_filestream_flush(mp);
934
912 return xfs_syncsub(mp, flags, NULL); 935 return xfs_syncsub(mp, flags, NULL);
913} 936}
914 937
@@ -1659,6 +1682,7 @@ xfs_vget(
1659 * in stat(). */ 1682 * in stat(). */
1660#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ 1683#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */
1661#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ 1684#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */
1685#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */
1662 1686
1663STATIC unsigned long 1687STATIC unsigned long
1664suffix_strtoul(char *s, char **endp, unsigned int base) 1688suffix_strtoul(char *s, char **endp, unsigned int base)
@@ -1845,6 +1869,8 @@ xfs_parseargs(
1845 args->flags |= XFSMNT_ATTR2; 1869 args->flags |= XFSMNT_ATTR2;
1846 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { 1870 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
1847 args->flags &= ~XFSMNT_ATTR2; 1871 args->flags &= ~XFSMNT_ATTR2;
1872 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
1873 args->flags2 |= XFSMNT2_FILESTREAMS;
1848 } else if (!strcmp(this_char, "osyncisdsync")) { 1874 } else if (!strcmp(this_char, "osyncisdsync")) {
1849 /* no-op, this is now the default */ 1875 /* no-op, this is now the default */
1850 cmn_err(CE_WARN, 1876 cmn_err(CE_WARN,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2067d0b0a10e..60fd0be90a16 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -51,6 +51,7 @@
51#include "xfs_refcache.h" 51#include "xfs_refcache.h"
52#include "xfs_trans_space.h" 52#include "xfs_trans_space.h"
53#include "xfs_log_priv.h" 53#include "xfs_log_priv.h"
54#include "xfs_filestream.h"
54 55
55STATIC int 56STATIC int
56xfs_open( 57xfs_open(
@@ -783,6 +784,8 @@ xfs_setattr(
783 di_flags |= XFS_DIFLAG_PROJINHERIT; 784 di_flags |= XFS_DIFLAG_PROJINHERIT;
784 if (vap->va_xflags & XFS_XFLAG_NODEFRAG) 785 if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
785 di_flags |= XFS_DIFLAG_NODEFRAG; 786 di_flags |= XFS_DIFLAG_NODEFRAG;
787 if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
788 di_flags |= XFS_DIFLAG_FILESTREAM;
786 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 789 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
787 if (vap->va_xflags & XFS_XFLAG_RTINHERIT) 790 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
788 di_flags |= XFS_DIFLAG_RTINHERIT; 791 di_flags |= XFS_DIFLAG_RTINHERIT;
@@ -1536,7 +1539,17 @@ xfs_release(
1536 if (vp->v_vfsp->vfs_flag & VFS_RDONLY) 1539 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1537 return 0; 1540 return 0;
1538 1541
1539 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1542 if (!XFS_FORCED_SHUTDOWN(mp)) {
1543 /*
1544 * If we are using filestreams, and we have an unlinked
1545 * file that we are processing the last close on, then nothing
1546 * will be able to reopen and write to this file. Purge this
1547 * inode from the filestreams cache so that it doesn't delay
1548 * teardown of the inode.
1549 */
1550 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1551 xfs_filestream_deassociate(ip);
1552
1540 /* 1553 /*
1541 * If we previously truncated this file and removed old data 1554 * If we previously truncated this file and removed old data
1542 * in the process, we want to initiate "early" writeout on 1555 * in the process, we want to initiate "early" writeout on
@@ -1551,7 +1564,6 @@ xfs_release(
1551 bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE); 1564 bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
1552 } 1565 }
1553 1566
1554
1555#ifdef HAVE_REFCACHE 1567#ifdef HAVE_REFCACHE
1556 /* If we are in the NFS reference cache then don't do this now */ 1568 /* If we are in the NFS reference cache then don't do this now */
1557 if (ip->i_refcache) 1569 if (ip->i_refcache)
@@ -2541,6 +2553,15 @@ xfs_remove(
2541 */ 2553 */
2542 xfs_refcache_purge_ip(ip); 2554 xfs_refcache_purge_ip(ip);
2543 2555
2556 /*
2557 * If we are using filestreams, kill the stream association.
2558 * If the file is still open it may get a new one but that
2559 * will get killed on last close in xfs_close() so we don't
2560 * have to worry about that.
2561 */
2562 if (link_zero && xfs_inode_is_filestream(ip))
2563 xfs_filestream_deassociate(ip);
2564
2544 vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); 2565 vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2545 2566
2546 /* 2567 /*