diff options
author | Ryan Ding <ryan.ding@oracle.com> | 2016-03-25 17:21:06 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-25 19:37:42 -0400 |
commit | 4506cfb6f8cad594ac73e0df2b2961ca10dbd25e (patch) | |
tree | 2dff95967b493d1629ad384ec593895564c64c03 /fs/ocfs2/aops.c | |
parent | 2de6a3c73180ef4071b45185756be51a6c767924 (diff) |
ocfs2: record UNWRITTEN extents when populate write desc
To support direct io in ocfs2_write_begin_nolock & ocfs2_write_end_nolock,
there is still one issue to resolve in the direct write procedure.
phase 1: alloc extent with UNWRITTEN flag
phase 2: submit direct data to disk, add zero page to page cache
phase 3: clear UNWRITTEN flag when data has been written to disk
Suppose there are 2 direct writes, A (0~3KB) and B (4~7KB), writing to the same
cluster 0~7KB (cluster size 8KB). Write request A arrives at phase 2 first
and zeroes the region (4~7KB). Before request A enters phase 3,
request B arrives at phase 2 and zeroes the region (0~3KB). In effect,
request B tramples on request A.
To resolve this issue, we should let request B know that this cluster is already
being zeroed, to prevent it from trampling on the previous write request.
This patch adds the function ocfs2_unwritten_check() to do this job. It
records all clusters that are under direct write (they are recorded
in the 'ip_unwritten_list' member of the inode info), and prevents a later
direct write to the same cluster from doing the zero work again.
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/ocfs2/aops.c')
-rw-r--r-- | fs/ocfs2/aops.c | 104 |
1 files changed, 99 insertions, 5 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7b268c357cf3..c29d06634fd6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -1201,6 +1201,13 @@ next_bh: | |||
1201 | 1201 | ||
1202 | #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) | 1202 | #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) |
1203 | 1203 | ||
1204 | struct ocfs2_unwritten_extent { | ||
1205 | struct list_head ue_node; | ||
1206 | struct list_head ue_ip_node; | ||
1207 | u32 ue_cpos; | ||
1208 | u32 ue_phys; | ||
1209 | }; | ||
1210 | |||
1204 | /* | 1211 | /* |
1205 | * Describe the state of a single cluster to be written to. | 1212 | * Describe the state of a single cluster to be written to. |
1206 | */ | 1213 | */ |
@@ -1275,6 +1282,8 @@ struct ocfs2_write_ctxt { | |||
1275 | struct buffer_head *w_di_bh; | 1282 | struct buffer_head *w_di_bh; |
1276 | 1283 | ||
1277 | struct ocfs2_cached_dealloc_ctxt w_dealloc; | 1284 | struct ocfs2_cached_dealloc_ctxt w_dealloc; |
1285 | |||
1286 | struct list_head w_unwritten_list; | ||
1278 | }; | 1287 | }; |
1279 | 1288 | ||
1280 | void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) | 1289 | void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) |
@@ -1313,8 +1322,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) | |||
1313 | ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); | 1322 | ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); |
1314 | } | 1323 | } |
1315 | 1324 | ||
1316 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | 1325 | static void ocfs2_free_unwritten_list(struct inode *inode, |
1326 | struct list_head *head) | ||
1327 | { | ||
1328 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1329 | struct ocfs2_unwritten_extent *dz = NULL, *tmp = NULL; | ||
1330 | |||
1331 | list_for_each_entry_safe(dz, tmp, head, ue_node) { | ||
1332 | list_del(&dz->ue_node); | ||
1333 | spin_lock(&oi->ip_lock); | ||
1334 | list_del(&dz->ue_ip_node); | ||
1335 | spin_unlock(&oi->ip_lock); | ||
1336 | kfree(dz); | ||
1337 | } | ||
1338 | } | ||
1339 | |||
1340 | static void ocfs2_free_write_ctxt(struct inode *inode, | ||
1341 | struct ocfs2_write_ctxt *wc) | ||
1317 | { | 1342 | { |
1343 | ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list); | ||
1318 | ocfs2_unlock_pages(wc); | 1344 | ocfs2_unlock_pages(wc); |
1319 | brelse(wc->w_di_bh); | 1345 | brelse(wc->w_di_bh); |
1320 | kfree(wc); | 1346 | kfree(wc); |
@@ -1346,6 +1372,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, | |||
1346 | wc->w_large_pages = 0; | 1372 | wc->w_large_pages = 0; |
1347 | 1373 | ||
1348 | ocfs2_init_dealloc_ctxt(&wc->w_dealloc); | 1374 | ocfs2_init_dealloc_ctxt(&wc->w_dealloc); |
1375 | INIT_LIST_HEAD(&wc->w_unwritten_list); | ||
1349 | 1376 | ||
1350 | *wcp = wc; | 1377 | *wcp = wc; |
1351 | 1378 | ||
@@ -1796,6 +1823,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | |||
1796 | } | 1823 | } |
1797 | 1824 | ||
1798 | /* | 1825 | /* |
1826 | * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to | ||
1827 | * do the zero work. And should not to clear UNWRITTEN since it will be cleared | ||
1828 | * by the direct io procedure. | ||
1829 | * If this is a new extent that allocated by direct io, we should mark it in | ||
1830 | * the ip_unwritten_list. | ||
1831 | */ | ||
1832 | static int ocfs2_unwritten_check(struct inode *inode, | ||
1833 | struct ocfs2_write_ctxt *wc, | ||
1834 | struct ocfs2_write_cluster_desc *desc) | ||
1835 | { | ||
1836 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1837 | struct ocfs2_unwritten_extent *dz = NULL, *new = NULL; | ||
1838 | int ret = 0; | ||
1839 | |||
1840 | if (!desc->c_needs_zero) | ||
1841 | return 0; | ||
1842 | |||
1843 | retry: | ||
1844 | spin_lock(&oi->ip_lock); | ||
1845 | /* Needs not to zero no metter buffer or direct. The one who is zero | ||
1846 | * the cluster is doing zero. And he will clear unwritten after all | ||
1847 | * cluster io finished. */ | ||
1848 | list_for_each_entry(dz, &oi->ip_unwritten_list, ue_ip_node) { | ||
1849 | if (desc->c_cpos == dz->ue_cpos) { | ||
1850 | BUG_ON(desc->c_new); | ||
1851 | desc->c_needs_zero = 0; | ||
1852 | desc->c_clear_unwritten = 0; | ||
1853 | goto unlock; | ||
1854 | } | ||
1855 | } | ||
1856 | |||
1857 | if (wc->w_type != OCFS2_WRITE_DIRECT) | ||
1858 | goto unlock; | ||
1859 | |||
1860 | if (new == NULL) { | ||
1861 | spin_unlock(&oi->ip_lock); | ||
1862 | new = kmalloc(sizeof(struct ocfs2_unwritten_extent), | ||
1863 | GFP_NOFS); | ||
1864 | if (new == NULL) { | ||
1865 | ret = -ENOMEM; | ||
1866 | goto out; | ||
1867 | } | ||
1868 | goto retry; | ||
1869 | } | ||
1870 | /* This direct write will doing zero. */ | ||
1871 | new->ue_cpos = desc->c_cpos; | ||
1872 | new->ue_phys = desc->c_phys; | ||
1873 | desc->c_clear_unwritten = 0; | ||
1874 | list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); | ||
1875 | list_add_tail(&new->ue_node, &wc->w_unwritten_list); | ||
1876 | new = NULL; | ||
1877 | unlock: | ||
1878 | spin_unlock(&oi->ip_lock); | ||
1879 | out: | ||
1880 | if (new) | ||
1881 | kfree(new); | ||
1882 | return ret; | ||
1883 | } | ||
1884 | |||
1885 | /* | ||
1799 | * Populate each single-cluster write descriptor in the write context | 1886 | * Populate each single-cluster write descriptor in the write context |
1800 | * with information about the i/o to be done. | 1887 | * with information about the i/o to be done. |
1801 | * | 1888 | * |
@@ -1879,6 +1966,12 @@ static int ocfs2_populate_write_desc(struct inode *inode, | |||
1879 | desc->c_needs_zero = 1; | 1966 | desc->c_needs_zero = 1; |
1880 | } | 1967 | } |
1881 | 1968 | ||
1969 | ret = ocfs2_unwritten_check(inode, wc, desc); | ||
1970 | if (ret) { | ||
1971 | mlog_errno(ret); | ||
1972 | goto out; | ||
1973 | } | ||
1974 | |||
1882 | num_clusters--; | 1975 | num_clusters--; |
1883 | } | 1976 | } |
1884 | 1977 | ||
@@ -2215,9 +2308,8 @@ try_again: | |||
2215 | * and non-sparse clusters we just extended. For non-sparse writes, | 2308 | * and non-sparse clusters we just extended. For non-sparse writes, |
2216 | * we know zeros will only be needed in the first and/or last cluster. | 2309 | * we know zeros will only be needed in the first and/or last cluster. |
2217 | */ | 2310 | */ |
2218 | if (clusters_to_alloc || extents_to_split || | 2311 | if (wc->w_clen && (wc->w_desc[0].c_needs_zero || |
2219 | (wc->w_clen && (wc->w_desc[0].c_needs_zero || | 2312 | wc->w_desc[wc->w_clen - 1].c_needs_zero)) |
2220 | wc->w_desc[wc->w_clen - 1].c_needs_zero))) | ||
2221 | cluster_of_pages = 1; | 2313 | cluster_of_pages = 1; |
2222 | else | 2314 | else |
2223 | cluster_of_pages = 0; | 2315 | cluster_of_pages = 0; |
@@ -2296,7 +2388,7 @@ out_commit: | |||
2296 | ocfs2_commit_trans(osb, handle); | 2388 | ocfs2_commit_trans(osb, handle); |
2297 | 2389 | ||
2298 | out: | 2390 | out: |
2299 | ocfs2_free_write_ctxt(wc); | 2391 | ocfs2_free_write_ctxt(inode, wc); |
2300 | 2392 | ||
2301 | if (data_ac) { | 2393 | if (data_ac) { |
2302 | ocfs2_free_alloc_context(data_ac); | 2394 | ocfs2_free_alloc_context(data_ac); |
@@ -2406,6 +2498,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
2406 | handle_t *handle = wc->w_handle; | 2498 | handle_t *handle = wc->w_handle; |
2407 | struct page *tmppage; | 2499 | struct page *tmppage; |
2408 | 2500 | ||
2501 | BUG_ON(!list_empty(&wc->w_unwritten_list)); | ||
2502 | |||
2409 | if (handle) { | 2503 | if (handle) { |
2410 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), | 2504 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), |
2411 | wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); | 2505 | wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); |