From e92f771f29b022eae33ef58c02dc0e18d7f7b6c3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 24 Jun 2020 13:25:51 -0700 Subject: [PATCH 001/580] xfs: drop I_DIRTY_TIME_EXPIRED Fix missing cherry-pick. Fixes: 00f6b03b4176 ("writeback: Drop I_DIRTY_TIME_EXPIRE") Signed-off-by: Jaegeuk Kim --- fs/xfs/xfs_trans_inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 542927321a61..f2ee2b643d53 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -97,9 +97,9 @@ xfs_trans_log_inode( * to log the timestamps, or will clear already cleared fields in the * worst case. */ - if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) { + if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); } From 980631d032c1617e0d054700deb093f44077ceb0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 16 Jun 2020 09:56:51 -0700 Subject: [PATCH 002/580] f2fs: avoid checkpatch error ERROR:INITIALISED_STATIC: do not initialise statics to NULL Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index e97263bf181b..8587a779ddeb 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -477,7 +477,7 @@ bool f2fs_is_compress_backend_ready(struct inode *inode) return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; } -static mempool_t *compress_page_pool = NULL; +static mempool_t *compress_page_pool; static int num_compress_pages = 512; module_param(num_compress_pages, uint, 0444); MODULE_PARM_DESC(num_compress_pages, From ac56f794c1a192d42089d3c12259f88dea2b635a Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 10 Jun 2020 01:14:46 +0300 Subject: [PATCH 003/580] f2fs: use kfree() instead of kvfree() to free superblock data Use kfree() instead of kvfree() to free super in read_raw_super_block() because the memory is allocated with kzalloc() in the function. Use kfree() instead of kvfree() to free sbi, raw_super in f2fs_fill_super() and f2fs_put_super() because the memory is allocated with kzalloc(). 
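As background for the change above (a minimal sketch, not part of the patch; variable names illustrative): memory obtained from kmalloc()/kzalloc() is slab-backed and is conventionally released with kfree(), while kvfree() is intended for kvmalloc()/kvzalloc() allocations that may fall back to vmalloc(). kvfree() does accept slab pointers, but only after an extra is_vmalloc_addr() check, so pairing each free with its allocator keeps the intent clear:

	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);	/* slab allocation */
	...
	kfree(sbi);					/* matches kzalloc() */

	bitmap = kvzalloc(bitmap_size, GFP_KERNEL);	/* may be vmalloc-backed */
	...
	kvfree(bitmap);					/* handles either backing */
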
Signed-off-by: Denis Efremov Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d9b2b87c9fb5..0ec17eb876d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1247,7 +1247,7 @@ static void f2fs_put_super(struct super_block *sb) sb->s_fs_info = NULL; if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - kvfree(sbi->raw_super); + kfree(sbi->raw_super); destroy_device_list(sbi); f2fs_destroy_xattr_caches(sbi); @@ -1263,7 +1263,7 @@ static void f2fs_put_super(struct super_block *sb) #ifdef CONFIG_UNICODE utf8_unload(sbi->s_encoding); #endif - kvfree(sbi); + kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) @@ -3159,7 +3159,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, /* No valid superblock */ if (!*raw_super) - kvfree(super); + kfree(super); else err = 0; @@ -3827,11 +3827,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); kvfree(options); free_sb_buf: - kvfree(raw_super); + kfree(raw_super); free_sbi: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - kvfree(sbi); + kfree(sbi); /* give only one another chance */ if (retry_cnt > 0 && skip_recovery) { From 878d64604fb7d260026e33cee8bea3f053ca966b Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 15 Jun 2020 16:11:38 +0800 Subject: [PATCH 004/580] f2fs: remove useless truncate in f2fs_collapse_range() Since offset < new_size, no need to do truncate_pagecache() again with new_size. Signed-off-by: Wei Fang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 90f3945fd988..d7085ec8ee6c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1376,8 +1376,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); new_size = i_size_read(inode) - len; - truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) From 19a0b88d0715b3eb39c4f54ea57d810aa327d072 Mon Sep 17 00:00:00 2001 From: Wuyun Zhao Date: Thu, 18 Jun 2020 10:58:37 +0800 Subject: [PATCH 005/580] f2fs: fix a race condition between f2fs_write_end_io and f2fs_del_fsync_node_entry Under some condition, the __write_node_page will submit a page which is not f2fs_in_warm_node_list and will not call f2fs_add_fsync_node_entry. f2fs_gc continue to run to invoke f2fs_iget -> do_read_inode to read the same node page and set code node, which make f2fs_in_warm_node_list become true, that will cause f2fs_bug_on in f2fs_del_fsync_node_entry when f2fs_write_end_io called. 
- f2fs_write_end_io - f2fs_iget - do_read_inode - set_cold_node recover cold node flag - f2fs_in_warm_node_list - is_cold_node if node is cold, assume we have added node to fsync_node_list during writepages() - f2fs_del_fsync_node_entry - f2fs_bug_on() due to node page is not in fsync_node_list [ 34.966133] Call trace: [ 34.969902] f2fs_del_fsync_node_entry+0x100/0x108 [ 34.976071] f2fs_write_end_io+0x1e0/0x288 [ 34.981539] bio_endio+0x248/0x270 [ 34.986289] blk_update_request+0x2b0/0x4d8 [ 34.991841] scsi_end_request+0x40/0x440 [ 34.997126] scsi_io_completion+0xa4/0x748 [ 35.002593] scsi_finish_command+0xdc/0x110 [ 35.008143] scsi_softirq_done+0x118/0x150 [ 35.013610] blk_done_softirq+0x8c/0xe8 [ 35.018811] __do_softirq+0x2e8/0x578 [ 35.023828] irq_exit+0xfc/0x120 [ 35.028398] handle_IPI+0x1d8/0x330 [ 35.033233] gic_handle_irq+0x110/0x1d4 [ 35.038433] el1_irq+0xb4/0x130 [ 35.042917] kmem_cache_alloc+0x3f0/0x418 [ 35.048288] radix_tree_node_alloc+0x50/0xf8 [ 35.053933] __radix_tree_create+0xf8/0x188 [ 35.059484] __radix_tree_insert+0x3c/0x128 [ 35.065035] add_gc_inode+0x90/0x118 [ 35.069967] f2fs_gc+0x1b80/0x2d70 [ 35.074718] f2fs_disable_checkpoint+0x94/0x1d0 [ 35.080621] f2fs_fill_super+0x10c4/0x1b88 [ 35.086088] mount_bdev+0x194/0x1e0 [ 35.090923] f2fs_mount+0x40/0x50 [ 35.095589] mount_fs+0xb4/0x190 [ 35.100159] vfs_kern_mount+0x80/0x1d8 [ 35.105260] do_mount+0x478/0xf18 [ 35.109926] ksys_mount+0x90/0xd0 [ 35.114592] __arm64_sys_mount+0x24/0x38 Signed-off-by: Wuyun Zhao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index be6ac33461d1..0df5c8ca3e5a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -402,6 +402,7 @@ static int do_read_inode(struct inode *inode) /* try to recover cold bit for non-dir inode */ if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) { + f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_cold_node(node_page, false); set_page_dirty(node_page); } From 070fddbb6bd6020b46f7233ee1c854967b45aa3a Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Thu, 18 Jun 2020 12:37:10 +0800 Subject: [PATCH 006/580] f2fs: get the right gc victim section when section has several segments Assume each section has 4 segment: .___________________________. |_Segment0_|_..._|_Segment3_| . . . . .__________. |_section0_| Segment 0~2 has 0 valid block, segment 3 has 512 valid blocks. It will fail if we want to gc section0 in this scenes, because all 4 segments in section0 is not dirty. So we should use dirty section bitmap instead of dirty segment bitmap to get right victim section. 
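To make the example concrete (an illustrative note, not part of the patch): with 4 segments per section and 512 blocks per segment, section0 above holds 512 valid blocks out of 2048, so it is a reasonable GC victim; but a segment is tracked in dirty_segmap[DIRTY] only while it is partially valid, so segments 0~2 (empty) and segment 3 (full) are all skipped and the bitmap walk in get_victim_by_default() never offers this section. Tracking dirtiness per section instead, roughly:

	if (valid_blocks && valid_blocks < BLKS_PER_SEC(sbi))
		set_bit(secno, dirty_i->dirty_secmap);

makes such sections visible to the victim search.
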
Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 39 +++++++++++++++++------------- fs/f2fs/segment.c | 61 +++++++++++++++++++++++++++++++++++++++++++++-- fs/f2fs/segment.h | 8 +++++-- 3 files changed, 88 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5b95d5a146eb..9b6fc61ce649 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -21,6 +21,9 @@ #include "gc.h" #include +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len); + static int gc_thread_func(void *data) { struct f2fs_sb_info *sbi = data; @@ -187,14 +190,20 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (p->alloc_mode == SSR) { p->gc_mode = GC_GREEDY; - p->dirty_segmap = dirty_i->dirty_segmap[type]; + p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { p->gc_mode = select_gc_type(sbi, gc_type); - p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; - p->max_search = dirty_i->nr_dirty[DIRTY]; p->ofs_unit = sbi->segs_per_sec; + if (__is_large_section(sbi)) { + p->dirty_bitmap = dirty_i->dirty_secmap; + p->max_search = count_bits(p->dirty_bitmap, + 0, MAIN_SECS(sbi)); + } else { + p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; + } } /* @@ -365,10 +374,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } while (1) { - unsigned long cost; - unsigned int segno; + unsigned long cost, *dirty_bitmap; + unsigned int unit_no, segno; - segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); + dirty_bitmap = p.dirty_bitmap; + unit_no = find_next_bit(dirty_bitmap, + last_segment / p.ofs_unit, + p.offset / p.ofs_unit); + segno = unit_no * p.ofs_unit; if (segno >= last_segment) { if (sm->last_victim[p.gc_mode]) { last_segment = @@ -381,14 +394,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } p.offset = segno + p.ofs_unit; - if (p.ofs_unit > 1) { - p.offset -= segno % p.ofs_unit; - nsearched += count_bits(p.dirty_segmap, - p.offset - p.ofs_unit, - p.ofs_unit); - } else { - nsearched++; - } + nsearched++; #ifdef CONFIG_F2FS_CHECK_FS /* @@ -421,9 +427,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, next: if (nsearched >= p.max_search) { if (!sm->last_victim[p.gc_mode] && segno <= last_victim) - sm->last_victim[p.gc_mode] = last_victim + 1; + sm->last_victim[p.gc_mode] = + last_victim + p.ofs_unit; else - sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + p.ofs_unit; sm->last_victim[p.gc_mode] %= (MAIN_SECS(sbi) * sbi->segs_per_sec); break; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e4898789c5c0..142e7d6f549c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -796,6 +796,18 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, } if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]++; + + if (__is_large_section(sbi)) { + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned short valid_blocks = + get_valid_blocks(sbi, segno, true); + + f2fs_bug_on(sbi, unlikely(!valid_blocks || + valid_blocks == BLKS_PER_SEC(sbi))); + + if (!IS_CURSEC(sbi, secno)) + set_bit(secno, dirty_i->dirty_secmap); + } } } @@ -803,6 +815,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned short valid_blocks; if (test_and_clear_bit(segno, 
dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]--; @@ -814,13 +827,26 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, true) == 0) { + valid_blocks = get_valid_blocks(sbi, segno, true); + if (valid_blocks == 0) { clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); #ifdef CONFIG_F2FS_CHECK_FS clear_bit(segno, SIT_I(sbi)->invalid_segmap); #endif } + if (__is_large_section(sbi)) { + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!valid_blocks || + valid_blocks == BLKS_PER_SEC(sbi)) { + clear_bit(secno, dirty_i->dirty_secmap); + return; + } + + if (!IS_CURSEC(sbi, secno)) + set_bit(secno, dirty_i->dirty_secmap); + } } } @@ -4292,8 +4318,9 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int segno = 0, offset = 0; + unsigned int segno = 0, offset = 0, secno; unsigned short valid_blocks; + unsigned short blks_per_sec = BLKS_PER_SEC(sbi); while (1) { /* find dirty segment based on free segmap */ @@ -4312,6 +4339,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) __locate_dirty_segment(sbi, segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); } + + if (!__is_large_section(sbi)) + return; + + mutex_lock(&dirty_i->seglist_lock); + for (segno = 0; segno < MAIN_SECS(sbi); segno += blks_per_sec) { + valid_blocks = get_valid_blocks(sbi, segno, true); + secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!valid_blocks || valid_blocks == blks_per_sec) + continue; + if (IS_CURSEC(sbi, secno)) + continue; + set_bit(secno, dirty_i->dirty_secmap); + } + mutex_unlock(&dirty_i->seglist_lock); } static int init_victim_secmap(struct f2fs_sb_info *sbi) @@ -4348,6 +4391,14 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) return -ENOMEM; } + if (__is_large_section(sbi)) { + bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + dirty_i->dirty_secmap = f2fs_kvzalloc(sbi, + bitmap_size, GFP_KERNEL); + if (!dirty_i->dirty_secmap) + return -ENOMEM; + } + init_dirty_segmap(sbi); return init_victim_secmap(sbi); } @@ -4517,6 +4568,12 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); + if (__is_large_section(sbi)) { + mutex_lock(&dirty_i->seglist_lock); + kvfree(dirty_i->dirty_secmap); + mutex_unlock(&dirty_i->seglist_lock); + } + destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; kvfree(dirty_i); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index cba16cca5189..f261e3e6a69b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -166,8 +166,11 @@ enum { struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ int gc_mode; /* GC_CB or GC_GREEDY */ - unsigned long *dirty_segmap; /* dirty segment bitmap */ - unsigned int max_search; /* maximum # of segments to search */ + unsigned long *dirty_bitmap; /* dirty segment/section bitmap */ + unsigned int max_search; /* + * maximum # of segments/sections + * to search + */ unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ @@ -266,6 +269,7 @@ enum dirty_type { struct dirty_seglist_info { const struct victim_selection *v_ops; /* victim selction operation */ unsigned long *dirty_segmap[NR_DIRTY_TYPE]; + unsigned long *dirty_secmap; struct mutex seglist_lock; /* lock for segment 
bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ From 7e77af54a152b974de0d091cad8ff12d4beb7de7 Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Wed, 17 Jun 2020 20:30:12 +0800 Subject: [PATCH 007/580] f2fs: use kfree() to free variables allocated by match_strdup() Use kfree() instead of kvfree() to free variables allocated by match_strdup(). Because the memory is allocated with kmalloc inside match_strdup(). Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0ec17eb876d7..47c589e61684 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -347,7 +347,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, set_opt(sbi, QUOTA); return 0; errout: - kvfree(qname); + kfree(qname); return ret; } @@ -359,7 +359,7 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype) f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); return -EINVAL; } - kvfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; return 0; } @@ -493,10 +493,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } else if (!strcmp(name, "sync")) { F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; } else { - kvfree(name); + kfree(name); return -EINVAL; } - kvfree(name); + kfree(name); break; case Opt_disable_roll_forward: set_opt(sbi, DISABLE_ROLL_FORWARD); @@ -653,17 +653,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (!strcmp(name, "adaptive")) { if (f2fs_sb_has_blkzoned(sbi)) { f2fs_warn(sbi, "adaptive mode is not allowed with zoned block device feature"); - kvfree(name); + kfree(name); return -EINVAL; } F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (!strcmp(name, "lfs")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; } else { - kvfree(name); + kfree(name); return -EINVAL; } - kvfree(name); + kfree(name); break; case Opt_io_size_bits: if (args->from && match_int(args, &arg)) @@ -789,10 +789,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } else if (!strcmp(name, "fs-based")) { F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; } else { - kvfree(name); + kfree(name); return -EINVAL; } - kvfree(name); + kfree(name); break; case Opt_alloc: name = match_strdup(&args[0]); @@ -804,10 +804,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } else if (!strcmp(name, "reuse")) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; } else { - kvfree(name); + kfree(name); return -EINVAL; } - kvfree(name); + kfree(name); break; case Opt_fsync: name = match_strdup(&args[0]); @@ -821,10 +821,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_NOBARRIER; } else { - kvfree(name); + kfree(name); return -EINVAL; } - kvfree(name); + kfree(name); break; case Opt_test_dummy_encryption: ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], @@ -1254,7 +1254,7 @@ static void f2fs_put_super(struct super_block *sb) mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kvfree(F2FS_OPTION(sbi).s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); 
destroy_percpu_info(sbi); @@ -1755,7 +1755,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) GFP_KERNEL); if (!org_mount_opt.s_qf_names[i]) { for (j = 0; j < i; j++) - kvfree(org_mount_opt.s_qf_names[j]); + kfree(org_mount_opt.s_qf_names[j]); return -ENOMEM; } } else { @@ -1880,7 +1880,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - kvfree(org_mount_opt.s_qf_names[i]); + kfree(org_mount_opt.s_qf_names[i]); #endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -1901,7 +1901,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - kvfree(F2FS_OPTION(sbi).s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; } #endif @@ -3822,7 +3822,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kvfree(F2FS_OPTION(sbi).s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); kvfree(options); From 44ad6cc5d9ecf9199bf3f52156be571bc29ef34a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 18 Jun 2020 14:36:22 +0800 Subject: [PATCH 008/580] f2fs: add prefix for exported symbols to avoid polluting global symbol namespace. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 4 ++-- fs/f2fs/data.c | 14 +++++++------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/file.c | 4 ++-- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 8587a779ddeb..d879b8170203 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -920,7 +920,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, } if (prealloc) { - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); set_new_dnode(&dn, cc->inode, NULL, NULL, 0); @@ -935,7 +935,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, break; } - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); } if (likely(!ret)) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 547c2c8f1a9d..aeb90a4873c5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1425,7 +1425,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } -void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { if (lock) @@ -1490,7 +1490,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (map->m_may_create) - __do_map_lock(sbi, flag, true); + f2fs_do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -1639,7 +1639,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_put_dnode(&dn); if (map->m_may_create) { - __do_map_lock(sbi, flag, false); + f2fs_do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -1665,7 +1665,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_put_dnode(&dn); unlock_out: if 
(map->m_may_create) { - __do_map_lock(sbi, flag, false); + f2fs_do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -3226,7 +3226,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - __do_map_lock(sbi, flag, true); + f2fs_do_map_lock(sbi, flag, true); locked = true; } @@ -3263,7 +3263,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; @@ -3279,7 +3279,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, f2fs_put_dnode(&dn); unlock_out: if (locked) - __do_map_lock(sbi, flag, false); + f2fs_do_map_lock(sbi, flag, false); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ffa0f54a8cbe..050b909c90d4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3331,7 +3331,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); -void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, +void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); @@ -3455,7 +3455,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); -void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); +void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d7085ec8ee6c..f5234a58d055 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -105,11 +105,11 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (need_alloc) { /* block allocation */ - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_block(&dn, page->index); f2fs_put_dnode(&dn); - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); } #ifdef CONFIG_F2FS_FS_COMPRESSION diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9b6fc61ce649..cb18a99a7f87 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1441,7 +1441,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, /* Move out cursegs from the target range */ for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++) - allocate_segment_for_resize(sbi, type, start, end); + f2fs_allocate_segment_for_resize(sbi, type, start, end); /* do GC to move out valid blocks in the range */ for (segno = start; segno <= end; segno += sbi->segs_per_sec) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 142e7d6f549c..e009e6676e63 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2699,7 +2699,7 @@ 
static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, +void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end) { struct curseg_info *curseg = CURSEG_I(sbi, type); From 07f7e318444e7798b00be70deb1ee9b9b07d6437 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 18 Jun 2020 14:36:23 +0800 Subject: [PATCH 009/580] f2fs: shrink node_write lock coverage - to avoid race between checkpoint and quota file writeback, it just needs to hold read lock of node_write in writeback path. - node_write lock has covered all LFS data write paths, it's not necessary, we only need to hold node_write lock at write path of quota file. This refactors commit ca7f76e68074 ("f2fs: fix wrong discard space"). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 18 +++++++++++++++--- fs/f2fs/data.c | 12 ++++++++++++ fs/f2fs/segment.c | 11 ----------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d879b8170203..e2b26b4f0b6a 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1067,8 +1067,16 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, loff_t psize; int i, err; - if (!IS_NOQUOTA(inode) && !f2fs_trylock_op(sbi)) + if (IS_NOQUOTA(inode)) { + /* + * We need to wait for node_write to avoid block allocation during + * checkpoint. This can only happen to quota writes which can cause + * the below discard race condition. + */ + down_read(&sbi->node_write); + } else if (!f2fs_trylock_op(sbi)) { return -EAGAIN; + } set_new_dnode(&dn, cc->inode, NULL, NULL, 0); @@ -1174,7 +1182,9 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); f2fs_put_dnode(&dn); - if (!IS_NOQUOTA(inode)) + if (IS_NOQUOTA(inode)) + up_read(&sbi->node_write); + else f2fs_unlock_op(sbi); spin_lock(&fi->i_size_lock); @@ -1201,7 +1211,9 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: - if (!IS_NOQUOTA(inode)) + if (IS_NOQUOTA(inode)) + up_read(&sbi->node_write); + else f2fs_unlock_op(sbi); return -EAGAIN; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aeb90a4873c5..fd3ccd2fdf78 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2728,8 +2728,20 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { + /* + * We need to wait for node_write to avoid block allocation during + * checkpoint. This can only happen to quota writes which can cause + * the below discard race condition. + */ + if (IS_NOQUOTA(inode)) + down_read(&sbi->node_write); + fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); + + if (IS_NOQUOTA(inode)) + up_read(&sbi->node_write); + goto done; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e009e6676e63..52321382b391 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3132,14 +3132,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, type = CURSEG_COLD_DATA; } - /* - * We need to wait for node_write to avoid block allocation during - * checkpoint. This can only happen to quota writes which can cause - * the below discard race condition. 
- */ - if (IS_DATASEG(type)) - down_write(&sbi->node_write); - down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); @@ -3205,9 +3197,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, up_read(&SM_I(sbi)->curseg_lock); - if (IS_DATASEG(type)) - up_write(&sbi->node_write); - if (put_pin_sem) up_read(&sbi->pin_sem); } From a87cc9e6a3172019bdb9bb82e7d0b78bb011b7f6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 18 Jun 2020 14:36:24 +0800 Subject: [PATCH 010/580] f2fs: clean up parameter of f2fs_allocate_data_block() Use validation of @fio to inidcate whether caller want to serialize IOs in io.io_list or not, then @add_list will be redundant, remove it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fd3ccd2fdf78..2be7b43cce49 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1365,7 +1365,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, - &sum, seg_type, NULL, false); + &sum, seg_type, NULL); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 050b909c90d4..dce88f055011 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3356,7 +3356,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio, bool add_list); + struct f2fs_io_info *fio); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index cb18a99a7f87..5e1368ce2389 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -866,7 +866,7 @@ static int move_data_block(struct inode *inode, block_t bidx, } f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA, NULL, false); + &sum, CURSEG_COLD_DATA, NULL); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 52321382b391..5e8e5e848393 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3114,7 +3114,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio, bool add_list) + struct f2fs_io_info *fio) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -3182,7 +3182,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (F2FS_IO_ALIGNED(sbi)) fio->retry = false; - if (add_list) { + if (fio) { struct f2fs_bio_info *io; INIT_LIST_HEAD(&fio->list); @@ -3231,7 +3231,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) down_read(&fio->sbi->io_order_lock); reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio, true); + 
&fio->new_blkaddr, sum, type, fio); if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(fio->sbi), fio->old_blkaddr, fio->old_blkaddr); From 7536a915e0a8841be010973970ea7a35893f08c3 Mon Sep 17 00:00:00 2001 From: Qilong Zhang Date: Sun, 28 Jun 2020 19:23:03 +0800 Subject: [PATCH 011/580] f2fs: add f2fs_gc exception handle in f2fs_ioc_gc_range When f2fs_ioc_gc_range performs multiple segments gc ops, the return value of f2fs_ioc_gc_range is determined by the last segment gc ops. If its ops failed, the f2fs_ioc_gc_range will be considered to be failed despite some of previous segments gc ops succeeded. Therefore, so we fix: Redefine the return value of getting victim ops and add exception handle for f2fs_gc. In particular, 1).if target has no valid block, it will go on. 2).if target sectoion has valid block(s), but it is current section, we will reminder the caller. Signed-off-by: Qilong Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++++ fs/f2fs/gc.c | 20 ++++++++++++++------ fs/f2fs/segment.c | 4 ++-- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f5234a58d055..3fd1b458ee2e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2528,6 +2528,11 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) } ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + if (ret) { + if (ret == -EBUSY) + ret = -EAGAIN; + goto out; + } range.start += BLKS_PER_SEC(sbi); if (range.start <= end) goto do_more; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5e1368ce2389..6eec3b2d606d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -330,6 +330,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int secno, last_victim; unsigned int last_segment; unsigned int nsearched = 0; + int ret = 0; mutex_lock(&dirty_i->seglist_lock); last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec; @@ -341,12 +342,19 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_cost = get_max_cost(sbi, &p); if (*result != NULL_SEGNO) { - if (get_valid_blocks(sbi, *result, false) && - !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + if (!get_valid_blocks(sbi, *result, false)) { + ret = -ENODATA; + goto out; + } + + if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + ret = -EBUSY; + else p.min_segno = *result; goto out; } + ret = -ENODATA; if (p.max_search == 0) goto out; @@ -447,6 +455,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, else set_bit(secno, dirty_i->victim_secmap); } + ret = 0; } out: @@ -456,7 +465,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, prefree_segments(sbi), free_segments(sbi)); mutex_unlock(&dirty_i->seglist_lock); - return (p.min_segno == NULL_SEGNO) ? 
0 : 1; + return ret; } static const struct victim_selection default_v_ops = { @@ -1340,10 +1349,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, ret = -EINVAL; goto stop; } - if (!__get_victim(sbi, &segno, gc_type)) { - ret = -ENODATA; + ret = __get_victim(sbi, &segno, gc_type); + if (ret) goto stop; - } seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5e8e5e848393..3ac4052b1c7d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2630,7 +2630,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) bool reversed = false; /* f2fs_need_SSR() already forces to do this */ - if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + if (!v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { curseg->next_segno = segno; return 1; } @@ -2657,7 +2657,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) for (; cnt-- > 0; reversed ? i-- : i++) { if (i == type) continue; - if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + if (!v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { curseg->next_segno = segno; return 1; } From a844980870f53706ffbeca82e073df996bf1fdc4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 28 Jun 2020 10:58:44 +0800 Subject: [PATCH 012/580] f2fs: show more debug info for per-temperature log - Add to account and show per-log dirty_seg, full_seg and valid_blocks in debugfs. - reformat printed info. TYPE segno secno zoneno dirty_seg full_seg valid_blk - COLD data: 1523 1523 1523 1 0 399 - WARM data: 769 769 769 20 255 133098 - HOT data: 767 767 767 9 0 167 - Dir dnode: 22 22 22 3 0 70 - File dnode: 722 722 722 14 10 6505 - Indir nodes: 2 2 2 1 0 3 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 64 +++++++++++++++++++++++++++++++++++++++---------- fs/f2fs/f2fs.h | 3 +++ 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0dbcb0f9c019..4276c0f79beb 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -174,6 +174,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = META_CP; i < META_MAX; i++) si->meta_count[i] = atomic_read(&sbi->meta_count[i]); + for (i = 0; i < NO_CHECK_TYPE; i++) { + si->dirty_seg[i] = 0; + si->full_seg[i] = 0; + si->valid_blks[i] = 0; + } + + for (i = 0; i < MAIN_SEGS(sbi); i++) { + int blks = get_seg_entry(sbi, i)->valid_blocks; + int type = get_seg_entry(sbi, i)->type; + + if (!blks) + continue; + + if (blks == sbi->blocks_per_seg) + si->full_seg[type]++; + else + si->dirty_seg[type]++; + si->valid_blks[type] += blks; + } + for (i = 0; i < 2; i++) { si->segment_count[i] = sbi->segment_count[i]; si->block_count[i] = sbi->block_count[i]; @@ -329,30 +349,50 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); - seq_printf(s, " - COLD data: %d, %d, %d\n", + seq_printf(s, " TYPE %8s %8s %8s %10s %10s %10s\n", + "segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk"); + seq_printf(s, " - COLD data: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_COLD_DATA], si->cursec[CURSEG_COLD_DATA], - si->curzone[CURSEG_COLD_DATA]); - seq_printf(s, " - WARM data: %d, %d, %d\n", + si->curzone[CURSEG_COLD_DATA], + si->dirty_seg[CURSEG_COLD_DATA], + si->full_seg[CURSEG_COLD_DATA], + si->valid_blks[CURSEG_COLD_DATA]); + seq_printf(s, " - WARM data: %8d %8d %8d %10u %10u %10u\n", 
si->curseg[CURSEG_WARM_DATA], si->cursec[CURSEG_WARM_DATA], - si->curzone[CURSEG_WARM_DATA]); - seq_printf(s, " - HOT data: %d, %d, %d\n", + si->curzone[CURSEG_WARM_DATA], + si->dirty_seg[CURSEG_WARM_DATA], + si->full_seg[CURSEG_WARM_DATA], + si->valid_blks[CURSEG_WARM_DATA]); + seq_printf(s, " - HOT data: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_HOT_DATA], si->cursec[CURSEG_HOT_DATA], - si->curzone[CURSEG_HOT_DATA]); - seq_printf(s, " - Dir dnode: %d, %d, %d\n", + si->curzone[CURSEG_HOT_DATA], + si->dirty_seg[CURSEG_HOT_DATA], + si->full_seg[CURSEG_HOT_DATA], + si->valid_blks[CURSEG_HOT_DATA]); + seq_printf(s, " - Dir dnode: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_HOT_NODE], si->cursec[CURSEG_HOT_NODE], - si->curzone[CURSEG_HOT_NODE]); - seq_printf(s, " - File dnode: %d, %d, %d\n", + si->curzone[CURSEG_HOT_NODE], + si->dirty_seg[CURSEG_HOT_NODE], + si->full_seg[CURSEG_HOT_NODE], + si->valid_blks[CURSEG_HOT_NODE]); + seq_printf(s, " - File dnode: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_WARM_NODE], si->cursec[CURSEG_WARM_NODE], - si->curzone[CURSEG_WARM_NODE]); - seq_printf(s, " - Indir nodes: %d, %d, %d\n", + si->curzone[CURSEG_WARM_NODE], + si->dirty_seg[CURSEG_WARM_NODE], + si->full_seg[CURSEG_WARM_NODE], + si->valid_blks[CURSEG_WARM_NODE]); + seq_printf(s, " - Indir nodes: %8d %8d %8d %10u %10u %10u\n", si->curseg[CURSEG_COLD_NODE], si->cursec[CURSEG_COLD_NODE], - si->curzone[CURSEG_COLD_NODE]); + si->curzone[CURSEG_COLD_NODE], + si->dirty_seg[CURSEG_COLD_NODE], + si->full_seg[CURSEG_COLD_NODE], + si->valid_blks[CURSEG_COLD_NODE]); seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", si->main_area_segs - si->dirty_count - si->prefree_count - si->free_segs, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dce88f055011..2a167abf0558 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3543,6 +3543,9 @@ struct f2fs_stat_info { int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; + unsigned int dirty_seg[NR_CURSEG_TYPE]; + unsigned int full_seg[NR_CURSEG_TYPE]; + unsigned int valid_blks[NR_CURSEG_TYPE]; unsigned int meta_count[META_MAX]; unsigned int segment_count[2]; From a151330eadb8f33798fb133323bdf6acc25dbef1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 28 Jun 2020 10:58:17 +0800 Subject: [PATCH 013/580] f2fs: fix to wait page writeback before update Filesystem including f2fs should support stable page for special device like software raid, however there is one missing path that page could be updated while it is writeback state as below, fix this. - gc_node_segment - f2fs_move_node_page - __write_node_page - set_page_writeback - do_read_inode - f2fs_init_extent_tree - __f2fs_init_extent_tree i_ext->len = 0; Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 18 +++++++++--------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inode.c | 3 +-- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index e60078460ad1..686c68b98610 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -325,9 +325,10 @@ static void __drop_largest_extent(struct extent_tree *et, } /* return true, if inode page is changed */ -static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_extent *i_ext = ipage ? 
&F2FS_INODE(ipage)->i_ext : NULL; struct extent_tree *et; struct extent_node *en; struct extent_info ei; @@ -335,16 +336,18 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e if (!f2fs_may_extent_tree(inode)) { /* drop largest extent */ if (i_ext && i_ext->len) { + f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; - return true; + set_page_dirty(ipage); + return; } - return false; + return; } et = __grab_extent_tree(inode); if (!i_ext || !i_ext->len) - return false; + return; get_extent_info(&ei, i_ext); @@ -360,17 +363,14 @@ static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_e } out: write_unlock(&et->lock); - return false; } -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) { - bool ret = __f2fs_init_extent_tree(inode, i_ext); + __f2fs_init_extent_tree(inode, ipage); if (!F2FS_I(inode)->extent_tree) set_inode_flag(inode, FI_NO_EXTENT); - - return ret; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2a167abf0558..2770f1c830f9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3805,7 +3805,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); unsigned int f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0df5c8ca3e5a..1a44b43518c8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -367,8 +367,7 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - if (f2fs_init_extent_tree(inode, &ri->i_ext)) - set_page_dirty(node_page); + f2fs_init_extent_tree(inode, node_page); get_inline_info(inode, ri); From 42bd1b1fccdc427fb4c3478149e894b37fae9586 Mon Sep 17 00:00:00 2001 From: Liu Song Date: Thu, 25 Jun 2020 20:40:11 +0800 Subject: [PATCH 014/580] f2fs: fix typo in comment of f2fs_do_add_link stakable/stackable Signed-off-by: Liu Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a02ffa06d9df..d559016668ab 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -779,7 +779,7 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, return err; /* - * An immature stakable filesystem shows a race condition between lookup + * An immature stackable filesystem shows a race condition between lookup * and create. If we have same task when doing lookup and create, it's * definitely fine as expected by VFS normally. Otherwise, let's just * verify on-disk dentry one more time, which guarantees filesystem From 0d30b584ef108491d2bb32516a3315a5165598d0 Mon Sep 17 00:00:00 2001 From: Liu Song Date: Sun, 28 Jun 2020 21:48:13 +0800 Subject: [PATCH 015/580] f2fs: remove useless parameter of __insert_free_nid() In current version, @state will only be FREE_NID. This parameter has no real effect so remove it to keep clean. 
Signed-off-by: Liu Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bd9da9de9b17..c8e53bea3b82 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2108,7 +2108,7 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, } static int __insert_free_nid(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_state state) + struct free_nid *i) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2116,10 +2116,8 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi, if (err) return err; - f2fs_bug_on(sbi, state != i->state); - nm_i->nid_cnt[state]++; - if (state == FREE_NID) - list_add_tail(&i->list, &nm_i->free_nid_list); + nm_i->nid_cnt[FREE_NID]++; + list_add_tail(&i->list, &nm_i->free_nid_list); return 0; } @@ -2241,7 +2239,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, } } ret = true; - err = __insert_free_nid(sbi, i, FREE_NID); + err = __insert_free_nid(sbi, i); err_out: if (update) { update_free_nid_bitmap(sbi, nid, ret, build); From c3e98ae01ef091eb197418fc583a57f410f92597 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 28 Jun 2020 20:29:38 +0800 Subject: [PATCH 016/580] f2fs: fix wrong return value of f2fs_bmap_compress() If compression is disable, we should return zero rather than -EOPNOTSUPP to indicate f2fs_bmap() is not supported. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2be7b43cce49..7aa72d3832f6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3712,10 +3712,9 @@ static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) } f2fs_put_dnode(&dn); - return blknr; #else - return -EOPNOTSUPP; + return 0; #endif } From a2dfd582e8caaaeccbd2f685fbff881a35f27484 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 29 Jun 2020 20:13:12 +0800 Subject: [PATCH 017/580] f2fs: support to trace f2fs_bmap() to show f2fs_bmap()'s result as below: f2fs_bmap: dev = (251,0), ino = 7, lblock:0, pblock:396800 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 +++++++++++--- include/trace/events/f2fs.h | 26 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7aa72d3832f6..7095e1619655 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3722,18 +3722,26 @@ static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; + struct buffer_head tmp = { + .b_size = i_blocksize(inode), + }; + sector_t blknr = 0; if (f2fs_has_inline_data(inode)) - return 0; + goto out; /* make sure allocating whole blocks */ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) filemap_write_and_wait(mapping); if (f2fs_compressed_file(inode)) - return f2fs_bmap_compress(inode, block); + blknr = f2fs_bmap_compress(inode, block); - return generic_block_bmap(mapping, block, get_data_block_bmap); + if (!get_data_block_bmap(inode, block, &tmp, 0)) + blknr = tmp.b_blocknr; +out: + trace_f2fs_bmap(inode, block, blknr); + return blknr; } #ifdef CONFIG_MIGRATION diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3c7c018a5740..3bd742d30e30 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1900,6 +1900,32 @@ TRACE_EVENT(f2fs_iostat, __entry->fs_cdrio, __entry->fs_nrio, 
__entry->fs_mrio) ); +TRACE_EVENT(f2fs_bmap, + + TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock), + + TP_ARGS(inode, lblock, pblock), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(sector_t, lblock) + __field(sector_t, pblock) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->lblock = lblock; + __entry->pblock = pblock; + ), + + TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld", + show_dev_ino(__entry), + (unsigned long long)__entry->lblock, + (unsigned long long)__entry->pblock) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From 01ea99db7f2453d97238d2a2c8063a37d33095b6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 29 Jun 2020 20:13:13 +0800 Subject: [PATCH 018/580] f2fs: support to trace f2fs_fiemap() to show f2fs_fiemap()'s result as below: f2fs_fiemap: dev = (251,0), ino = 7, lblock:0, pblock:1625292800, len:2097152, flags:0, ret:0 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++++- fs/f2fs/inline.c | 2 ++ include/trace/events/f2fs.h | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7095e1619655..09db480b58d5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1812,6 +1812,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, flags |= FIEMAP_EXTENT_LAST; err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + trace_f2fs_fiemap(inode, 0, phys, len, flags, err); if (err || err == 1) return err; } @@ -1835,8 +1836,10 @@ static int f2fs_xattr_fiemap(struct inode *inode, flags = FIEMAP_EXTENT_LAST; } - if (phys) + if (phys) { err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + trace_f2fs_fiemap(inode, 0, phys, len, flags, err); + } return (err < 0 ? 
err : 0); } @@ -1930,6 +1933,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); + trace_f2fs_fiemap(inode, logical, phys, size, flags, ret); if (ret) goto out; size = 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 6e497598c069..652b972dfd7b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -11,6 +11,7 @@ #include "f2fs.h" #include "node.h" +#include bool f2fs_may_inline_data(struct inode *inode) { @@ -775,6 +776,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, byteaddr += (char *)inline_data_addr(inode, ipage) - (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); + trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err); out: f2fs_put_page(ipage, 1); return err; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3bd742d30e30..9970c25942b5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1926,6 +1926,43 @@ TRACE_EVENT(f2fs_bmap, (unsigned long long)__entry->pblock) ); +TRACE_EVENT(f2fs_fiemap, + + TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock, + unsigned long long len, unsigned int flags, int ret), + + TP_ARGS(inode, lblock, pblock, len, flags, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(sector_t, lblock) + __field(sector_t, pblock) + __field(unsigned long long, len) + __field(unsigned int, flags) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->lblock = lblock; + __entry->pblock = pblock; + __entry->len = len; + __entry->flags = flags; + __entry->ret = ret; + ), + + TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld, " + "len:%llu, flags:%u, ret:%d", + show_dev_ino(__entry), + (unsigned long long)__entry->lblock, + (unsigned long long)__entry->pblock, + __entry->len, + __entry->flags, + __entry->ret) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From c8fb9822673676e9517cdd907fe1834b59d07c40 Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Tue, 16 Jun 2020 10:39:24 +0800 Subject: [PATCH 019/580] f2fs: remove the unused compr parameter The parameter compr is unused in the f2fs_cluster_blocks function so we no longer need to pass it as a parameter. 
Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index e2b26b4f0b6a..31414ff6fade 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -792,7 +792,7 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc) } /* return # of valid blocks in compressed cluster */ -static int f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) +static int f2fs_cluster_blocks(struct compress_ctx *cc) { return __f2fs_cluster_blocks(cc, false); } @@ -806,7 +806,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, }; - return f2fs_cluster_blocks(&cc, false); + return f2fs_cluster_blocks(&cc); } static bool cluster_may_compress(struct compress_ctx *cc) @@ -857,7 +857,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, bool prealloc; retry: - ret = f2fs_cluster_blocks(cc, false); + ret = f2fs_cluster_blocks(cc); if (ret <= 0) return ret; From a25158dce3bdea515a01bf392140b81a10003a73 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 Jun 2020 17:14:19 +0800 Subject: [PATCH 020/580] f2fs: fix to check page dirty status before writeback In f2fs_write_raw_pages(), we need to check page dirty status before writeback, because there could be a racer (e.g. reclaimer) helps writebacking the dirty page. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 31414ff6fade..1460bd4748cc 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1293,6 +1293,12 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); lock_page(cc->rpages[i]); + + if (!PageDirty(cc->rpages[i])) { + unlock_page(cc->rpages[i]); + continue; + } + clear_page_dirty_for_io(cc->rpages[i]); goto retry_write; } From 15e6747755969c7c47aeeed3f7e461a08c87d8fc Mon Sep 17 00:00:00 2001 From: Lihong Kou Date: Sat, 20 Jun 2020 10:12:17 +0800 Subject: [PATCH 021/580] f2fs: make trace enter and end in pairs for unlink In the f2fs_unlink we do not add trace end for some error paths, just add. 
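The resulting structure, roughly (abbreviated sketch of f2fs_unlink(), relying on its existing fail: label):

	trace_f2fs_unlink_enter(dir, dentry);
	...
	err = dquot_initialize(dir);
	if (err)
		goto fail;	/* previously returned here, skipping the exit trace */
	...
fail:
	trace_f2fs_unlink_exit(inode, err);
	return err;
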
Signed-off-by: Lihong Kou Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e94e02c6580a..a15a2831d43b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -569,15 +569,17 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto fail; + } err = dquot_initialize(dir); if (err) - return err; + goto fail; err = dquot_initialize(inode); if (err) - return err; + goto fail; de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { From e2970c134fb7f952d52018f72aba070b8ef5c2cd Mon Sep 17 00:00:00 2001 From: Yu Changchun Date: Sat, 20 Jun 2020 03:58:29 -0400 Subject: [PATCH 022/580] f2fs: fix an oops in f2fs_is_compressed_page This patch is to fix a crash: #3 [ffffb6580689f898] oops_end at ffffffffa2835bc2 #4 [ffffb6580689f8b8] no_context at ffffffffa28766e7 #5 [ffffb6580689f920] async_page_fault at ffffffffa320135e [exception RIP: f2fs_is_compressed_page+34] RIP: ffffffffa2ba83a2 RSP: ffffb6580689f9d8 RFLAGS: 00010213 RAX: 0000000000000001 RBX: fffffc0f50b34bc0 RCX: 0000000000002122 RDX: 0000000000002123 RSI: 0000000000000c00 RDI: fffffc0f50b34bc0 RBP: ffff97e815a40178 R8: 0000000000000000 R9: ffff97e83ffc9000 R10: 0000000000032300 R11: 0000000000032380 R12: ffffb6580689fa38 R13: fffffc0f50b34bc0 R14: ffff97e825cbd000 R15: 0000000000000c00 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #6 [ffffb6580689f9d8] __is_cp_guaranteed at ffffffffa2b7ea98 #7 [ffffb6580689f9f0] f2fs_submit_page_write at ffffffffa2b81a69 #8 [ffffb6580689fa30] f2fs_do_write_meta_page at ffffffffa2b99777 #9 [ffffb6580689fae0] __f2fs_write_meta_page at ffffffffa2b75f1a #10 [ffffb6580689fb18] f2fs_sync_meta_pages at ffffffffa2b77466 #11 [ffffb6580689fc98] do_checkpoint at ffffffffa2b78e46 #12 [ffffb6580689fd88] f2fs_write_checkpoint at ffffffffa2b79c29 #13 [ffffb6580689fdd0] f2fs_sync_fs at ffffffffa2b69d95 #14 [ffffb6580689fe20] sync_filesystem at ffffffffa2ad2574 #15 [ffffb6580689fe30] generic_shutdown_super at ffffffffa2a9b582 #16 [ffffb6580689fe48] kill_block_super at ffffffffa2a9b6d1 #17 [ffffb6580689fe60] kill_f2fs_super at ffffffffa2b6abe1 #18 [ffffb6580689fea0] deactivate_locked_super at ffffffffa2a9afb6 #19 [ffffb6580689feb8] cleanup_mnt at ffffffffa2abcad4 #20 [ffffb6580689fee0] task_work_run at ffffffffa28bca28 #21 [ffffb6580689ff00] exit_to_usermode_loop at ffffffffa28050b7 #22 [ffffb6580689ff38] do_syscall_64 at ffffffffa280560e #23 [ffffb6580689ff50] entry_SYSCALL_64_after_hwframe at ffffffffa320008c This occurred when umount f2fs if enable F2FS_FS_COMPRESSION with F2FS_IO_TRACE. Fixes it by adding IS_IO_TRACED_PAGE to check validity of pid for page_private. Signed-off-by: Yu Changchun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 7 +++++++ fs/f2fs/f2fs.h | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 1460bd4748cc..939fd3af5a17 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -50,6 +50,13 @@ bool f2fs_is_compressed_page(struct page *page) return false; if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) return false; + /* + * page->private may be set with pid. + * pid_max is enough to check if it is traced. 
+ */ + if (IS_IO_TRACED_PAGE(page)) + return false; + f2fs_bug_on(F2FS_M_SB(page->mapping), *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); return true; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2770f1c830f9..13b4fbae16dd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1312,6 +1312,14 @@ enum fsync_mode { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) +#ifdef CONFIG_F2FS_IO_TRACE +#define IS_IO_TRACED_PAGE(page) \ + (page_private(page) > 0 && \ + page_private(page) < (unsigned long)PID_MAX_LIMIT) +#else +#define IS_IO_TRACED_PAGE(page) (0) +#endif + #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) From 3bc61acd500b185e12da8fae56549fcf64cfeb08 Mon Sep 17 00:00:00 2001 From: Yubo Feng Date: Sat, 20 Jun 2020 16:39:43 +0800 Subject: [PATCH 023/580] f2fs: lost matching-pair of trace in f2fs_truncate_inode_blocks if get_node_path() return -E2BIG and trace of f2fs_truncate_inode_blocks_enter/exit enabled then the matching-pair of trace_exit will lost in log. Signed-off-by: Yubo Feng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c8e53bea3b82..4c5dd66351de 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1039,8 +1039,10 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); - if (level < 0) + if (level < 0) { + trace_f2fs_truncate_inode_blocks_exit(inode, level); return level; + } page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { From 7c92a6a9f1d6370fb531ee20658df1e5dc096305 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Jun 2020 17:38:48 +0800 Subject: [PATCH 024/580] f2fs: split f2fs_allocate_new_segments() to two independent functions: - f2fs_allocate_new_segment() for specified type segment allocation - f2fs_allocate_new_segments() for all data type segments allocation Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/file.c | 2 +- fs/f2fs/recovery.c | 2 +- fs/f2fs/segment.c | 41 ++++++++++++++++++++++++----------------- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 13b4fbae16dd..61936f3f02c2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3341,7 +3341,8 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3fd1b458ee2e..a0add6b1bbc4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1661,7 +1661,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, map.m_seg_type = CURSEG_COLD_DATA_PINNED; f2fs_lock_op(sbi); - f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA); + f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA); f2fs_unlock_op(sbi); err = f2fs_map_blocks(inode, &map, 1, 
F2FS_GET_BLOCK_PRE_DIO); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9f25027123bb..6fc0d085e460 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -742,7 +742,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, f2fs_put_page(page, 1); } if (!err) - f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE); + f2fs_allocate_new_segments(sbi); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3ac4052b1c7d..d87ed33b64a3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2732,28 +2732,35 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, up_read(&SM_I(sbi)->curseg_lock); } -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) { - struct curseg_info *curseg; + struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; + + if (!curseg->next_blkoff && + !get_valid_blocks(sbi, curseg->segno, false) && + !get_ckpt_valid_blocks(sbi, curseg->segno)) + return; + + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + locate_dirty_segment(sbi, old_segno); +} + +void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type) +{ + down_write(&SIT_I(sbi)->sentry_lock); + __allocate_new_segment(sbi, type); + up_write(&SIT_I(sbi)->sentry_lock); +} + +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) +{ int i; down_write(&SIT_I(sbi)->sentry_lock); - - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - if (type != NO_CHECK_TYPE && i != type) - continue; - - curseg = CURSEG_I(sbi, i); - if (type == NO_CHECK_TYPE || curseg->next_blkoff || - get_valid_blocks(sbi, curseg->segno, false) || - get_ckpt_valid_blocks(sbi, curseg->segno)) { - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); - locate_dirty_segment(sbi, old_segno); - } - } - + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) + __allocate_new_segment(sbi, i); up_write(&SIT_I(sbi)->sentry_lock); } From eb0e283d7af8d6bd002d3e685c5c1d2967b25e63 Mon Sep 17 00:00:00 2001 From: Jia Yang Date: Wed, 1 Jul 2020 10:27:40 +0800 Subject: [PATCH 025/580] f2fs: add parameter op_flag in f2fs_submit_page_read() The parameter op_flag is not used in f2fs_get_read_data_page(), but it is used in f2fs_grab_read_bio(). Obviously, op_flag is not passed to f2fs_grab_read_bio() successfully. We need to add parameter in f2fs_submit_page_read() to pass it. The case: - gc_data_segment - f2fs_get_read_data_page(.., op_flag = REQ_RAHEAD,..) - f2fs_submit_page_read - f2fs_grab_read_bio(.., op_flag = 0, ..) 
Signed-off-by: Jia Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 09db480b58d5..492b2540102a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1024,12 +1024,13 @@ static void f2fs_release_read_bio(struct bio *bio) /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, - block_t blkaddr, bool for_write) + block_t blkaddr, int op_flags, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index, for_write); + bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, + page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1216,7 +1217,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, return page; } - err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, for_write); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, + op_flags, for_write); if (err) goto put_err; return page; @@ -3407,7 +3409,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, err = -EFSCORRUPTED; goto fail; } - err = f2fs_submit_page_read(inode, page, blkaddr, true); + err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); if (err) goto fail; From cb88b596d1e287da18a2561d1c1f9a888449d08a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Jul 2020 10:27:09 +0800 Subject: [PATCH 026/580] f2fs: fix return value of move_data_block() If f2fs_grab_cache_page() fails, it needs to return -ENOMEM. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6eec3b2d606d..9a40761445d3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -849,8 +849,10 @@ static int move_data_block(struct inode *inode, block_t bidx, mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi), fio.old_blkaddr, false); - if (!mpage) + if (!mpage) { + err = -ENOMEM; goto up_out; + } fio.encrypted_page = mpage; From 39efe102bdefc9c39fc6dbdd0fe67bbac62c1b65 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 22 Jun 2020 23:01:05 -0700 Subject: [PATCH 027/580] f2fs: avoid readahead race condition If two readahead threads having same offset enter in readpages, every read IOs are split and issued to the disk which giving lower bandwidth. This patch tries to avoid redundant readahead calls. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 +++++++++++++++++- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 2 ++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 492b2540102a..42b98055f9e2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2298,6 +2298,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, #endif unsigned max_nr_pages = nr_pages; int ret = 0; + bool drop_ra = false; map.m_pblk = 0; map.m_lblk = 0; @@ -2308,13 +2309,25 @@ int f2fs_mpage_readpages(struct address_space *mapping, map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; + /* + * Two readahead threads for same address range can cause race condition + * which fragments sequential read IOs. So let's avoid each other. 
+ */ + if (pages && is_readahead) { + page = list_last_entry(pages, struct page, lru); + if (READ_ONCE(F2FS_I(inode)->ra_offset) == page_index(page)) + drop_ra = true; + else + WRITE_ONCE(F2FS_I(inode)->ra_offset, page_index(page)); + } + for (; nr_pages; nr_pages--) { if (pages) { page = list_last_entry(pages, struct page, lru); prefetchw(&page->flags); list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, + if (drop_ra || add_to_page_cache_lru(page, mapping, page_index(page), readahead_gfp_mask(mapping))) goto next_page; @@ -2379,6 +2392,9 @@ int f2fs_mpage_readpages(struct address_space *mapping, BUG_ON(pages && !list_empty(pages)); if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); + + if (pages && is_readahead && !drop_ra) + WRITE_ONCE(F2FS_I(inode)->ra_offset, -1); return pages ? 0 : ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 61936f3f02c2..39d3a760ecff 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -793,6 +793,7 @@ struct f2fs_inode_info { struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ + pgoff_t ra_offset; /* ongoing readahead offset */ struct extent_tree *extent_tree; /* cached extent_tree entry */ /* avoid racing between foreground op and gc */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 47c589e61684..64c207211675 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1011,6 +1011,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + fi->ra_offset = -1; + return &fi->vfs_inode; } From b4ea8d7a51649d9c18cf7a4cb7c818906ab6bf47 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 30 Jun 2020 09:54:22 +0900 Subject: [PATCH 028/580] f2fs: add GC_URGENT_LOW mode in gc_urgent Added a new gc_urgent mode, GC_URGENT_LOW, in which mode F2FS will lower the bar of checking idle in order to process outstanding discard commands and GC a little bit aggressively. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 4 +++- fs/f2fs/f2fs.h | 10 ++++++++-- fs/f2fs/gc.c | 6 +++--- fs/f2fs/segment.c | 4 ++-- fs/f2fs/sysfs.c | 6 ++++-- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 4bb93a06d8ab..7f730c4c8df2 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -229,7 +229,9 @@ Date: August 2017 Contact: "Jaegeuk Kim" Description: Do background GC agressively when set. When gc_urgent = 1, background thread starts to do GC by given gc_urgent_sleep_time - interval. It is set to 0 by default. + interval. When gc_urgent = 2, F2FS will lower the bar of + checking idle in order to process outstanding discard commands + and GC a little bit aggressively. It is set to 0 by default. 
What: /sys/fs/f2fs//gc_urgent_sleep_time Date: August 2017 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 39d3a760ecff..2b958c69e525 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1267,7 +1267,8 @@ enum { GC_NORMAL, GC_IDLE_CB, GC_IDLE_GREEDY, - GC_URGENT, + GC_URGENT_HIGH, + GC_URGENT_LOW, }; enum { @@ -1523,6 +1524,7 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ + /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ @@ -2464,7 +2466,7 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { - if (sbi->gc_mode == GC_URGENT) + if (sbi->gc_mode == GC_URGENT_HIGH) return true; if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || @@ -2482,6 +2484,10 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) return false; + if (sbi->gc_mode == GC_URGENT_LOW && + (type == DISCARD_TIME || type == GC_TIME)) + return true; + return f2fs_time_over(sbi, type); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9a40761445d3..11b4adde9baf 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. */ - if (sbi->gc_mode == GC_URGENT) { + if (sbi->gc_mode == GC_URGENT_HIGH) { wait_ms = gc_th->urgent_sleep_time; down_write(&sbi->gc_lock); goto do_gc; @@ -176,7 +176,7 @@ static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) gc_mode = GC_CB; break; case GC_IDLE_GREEDY: - case GC_URGENT: + case GC_URGENT_HIGH: gc_mode = GC_GREEDY; break; } @@ -211,7 +211,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, * foreground GC and urgent GC cases. 
*/ if (gc_type != FG_GC && - (sbi->gc_mode != GC_URGENT) && + (sbi->gc_mode != GC_URGENT_HIGH) && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d87ed33b64a3..543878c6222c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -174,7 +174,7 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) if (f2fs_lfs_mode(sbi)) return false; - if (sbi->gc_mode == GC_URGENT) + if (sbi->gc_mode == GC_URGENT_HIGH) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; @@ -1759,7 +1759,7 @@ static int issue_discard_thread(void *data) continue; } - if (sbi->gc_mode == GC_URGENT) + if (sbi->gc_mode == GC_URGENT_HIGH) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0b743caec1cd..0963cd703b29 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -350,14 +350,16 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return -EINVAL; if (!strcmp(a->attr.name, "gc_urgent")) { - if (t >= 1) { - sbi->gc_mode = GC_URGENT; + if (t == 1) { + sbi->gc_mode = GC_URGENT_HIGH; if (sbi->gc_thread) { sbi->gc_thread->gc_wake = 1; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); wake_up_discard_thread(sbi, true); } + } else if (t == 2) { + sbi->gc_mode = GC_URGENT_LOW; } else { sbi->gc_mode = GC_NORMAL; } From 163c2480efe46b07e35ab17bde8f096032072795 Mon Sep 17 00:00:00 2001 From: Dehe Gu Date: Fri, 3 Jul 2020 17:51:29 +0800 Subject: [PATCH 029/580] f2fs: remove write attribute of main_blkaddr sysfs node Fuzzing main_blkaddr sysfs node will corrupt this field's value, causing kernel panic, remove its write attribute to avoid potential security risk. [Chao Yu: add description] Signed-off-by: Dehe Gu Signed-off-by: Daiyue Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0963cd703b29..64dc083a82cc 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -223,6 +223,13 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, } #endif +static ssize_t main_blkaddr_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)MAIN_BLKADDR(sbi)); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -524,7 +531,6 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, main_blkaddr, main_blkaddr); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -567,6 +573,7 @@ F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); +F2FS_GENERAL_RO_ATTR(main_blkaddr); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); From dc6dd1e701ea25110b1590896e68b1ac8673347d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 3 Jul 2020 16:40:11 
+0800 Subject: [PATCH 030/580] f2fs: fix to wait GCed compressed page writeback like we did for encrypted page. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 939fd3af5a17..69ad6a331830 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1123,6 +1123,13 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, f2fs_set_compressed_page(cc->cpages[i], inode, cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; + + fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, fio.old_blkaddr); + if (fio.encrypted) { fio.page = cc->rpages[i + 1]; err = f2fs_encrypt_one_page(&fio); From 00974bd2b6442d5bed92639831b2370076a949f5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Jul 2020 18:23:36 +0800 Subject: [PATCH 031/580] f2fs: fix error path in do_recover_data() - don't panic kernel if f2fs_get_node_page() fails in f2fs_recover_inline_data() or f2fs_recover_inline_xattr(); - return error number of f2fs_truncate_blocks() to f2fs_recover_inline_data()'s caller; Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/inline.c | 19 ++++++++++++------- fs/f2fs/node.c | 6 ++++-- fs/f2fs/recovery.c | 10 ++++++++-- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2b958c69e525..a4c68b62c4e9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3308,7 +3308,7 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); -void f2fs_recover_inline_xattr(struct inode *inode, struct page *page); +int f2fs_recover_inline_xattr(struct inode *inode, struct page *page); int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, @@ -3776,7 +3776,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); int f2fs_write_inline_data(struct inode *inode, struct page *page); -bool f2fs_recover_inline_data(struct inode *inode, struct page *npage); +int f2fs_recover_inline_data(struct inode *inode, struct page *npage); struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, const struct f2fs_filename *fname, struct page **res_page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 652b972dfd7b..c4a51a3a257e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -253,7 +253,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) return 0; } -bool f2fs_recover_inline_data(struct inode *inode, struct page *npage) +int f2fs_recover_inline_data(struct inode *inode, struct page *npage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri = NULL; @@ -275,7 +275,8 @@ bool f2fs_recover_inline_data(struct inode *inode, struct page *npage) ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: ipage = f2fs_get_node_page(sbi, inode->i_ino); - f2fs_bug_on(sbi, IS_ERR(ipage)); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); 
f2fs_wait_on_page_writeback(ipage, NODE, true, true); @@ -288,21 +289,25 @@ bool f2fs_recover_inline_data(struct inode *inode, struct page *npage) set_page_dirty(ipage); f2fs_put_page(ipage, 1); - return true; + return 1; } if (f2fs_has_inline_data(inode)) { ipage = f2fs_get_node_page(sbi, inode->i_ino); - f2fs_bug_on(sbi, IS_ERR(ipage)); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); f2fs_truncate_inline_inode(inode, ipage, 0); clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false)) - return false; + int ret; + + ret = f2fs_truncate_blocks(inode, 0, false); + if (ret) + return ret; goto process_inline; } - return false; + return 0; } struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4c5dd66351de..c377560b9c13 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2572,7 +2572,7 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) return nr - nr_shrink; } -void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) +int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; size_t inline_size; @@ -2580,7 +2580,8 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) struct f2fs_inode *ri; ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); - f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); ri = F2FS_INODE(page); if (ri->i_inline & F2FS_INLINE_XATTR) { @@ -2599,6 +2600,7 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) update_inode: f2fs_update_inode(inode, ipage); f2fs_put_page(ipage, 1); + return 0; } int f2fs_recover_xattr_data(struct inode *inode, struct page *page) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 6fc0d085e460..c22149ea46fd 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -544,7 +544,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* step 1: recover xattr */ if (IS_INODE(page)) { - f2fs_recover_inline_xattr(inode, page); + err = f2fs_recover_inline_xattr(inode, page); + if (err) + goto out; } else if (f2fs_has_xattr_block(ofs_of_node(page))) { err = f2fs_recover_xattr_data(inode, page); if (!err) @@ -553,8 +555,12 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* step 2: recover inline data */ - if (f2fs_recover_inline_data(inode, page)) + err = f2fs_recover_inline_data(inode, page); + if (err) { + if (err == 1) + err = 0; goto out; + } /* step 3: recover data indices */ start = f2fs_start_bidx_of_node(ofs_of_node(page), inode); From d2f2997d9cecd0589b9c3933d7d86c78b6a93e9a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 7 Jul 2020 09:35:41 +0800 Subject: [PATCH 032/580] f2fs: don't keep meta inode pages used for compressed block migration meta inode's pages are used for encrypted, verity and compressed blocks, so the meta inode's cache invalidation condition in do_checkpoint() should consider compression as well, not just for verity and encryption, fix it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 236064930251..8f5753cc3037 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1515,9 +1515,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* * invalidate intermediate page cache borrowed from meta inode which are - * used for migration of encrypted or verity inode's blocks. + * used for migration of encrypted, verity or compressed inode's blocks. */ - if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi)) + if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) || + f2fs_sb_has_compression(sbi)) invalidate_mapping_pages(META_MAPPING(sbi), MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); From ca71e2b3768493a823333b83980f1e4e4c529350 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 14 Jul 2020 15:18:12 -0700 Subject: [PATCH 033/580] f2fs: use generic names for generic ioctls Don't define F2FS_IOC_* aliases to ioctls that already have a generic FS_IOC_* name. These aliases are unnecessary, and they make it unclear which ioctls are f2fs-specific and which are generic. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 25 +---------------------- fs/f2fs/file.c | 54 +++++++++++++++++++++++++------------------------- 2 files changed, 28 insertions(+), 51 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a4c68b62c4e9..06db56a4bf34 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -401,12 +401,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, } /* - * ioctl commands + * f2fs-specific ioctl commands */ -#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS -#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS -#define F2FS_IOC_GETVERSION FS_IOC_GETVERSION - #define F2FS_IOCTL_MAGIC 0xf5 #define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) #define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) @@ -434,13 +430,6 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \ _IOR(F2FS_IOCTL_MAGIC, 19, __u64) -#define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL -#define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL - -#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY -#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY -#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT - /* * should be same as XFS_IOC_GOINGDOWN. 
* Flags for going down operation used by FS_IOC_GOINGDOWN @@ -452,18 +441,6 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ #define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */ -#if defined(__KERNEL__) && defined(CONFIG_COMPAT) -/* - * ioctl commands in 32 bit emulation - */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS -#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION -#endif - -#define F2FS_IOC_FSGETXATTR FS_IOC_FSGETXATTR -#define F2FS_IOC_FSSETXATTR FS_IOC_FSSETXATTR - struct f2fs_gc_range { u32 sync; u64 start; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a0add6b1bbc4..f073f257229c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3365,7 +3365,7 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) return fsverity_ioctl_measure(filp, (void __user *)arg); } -static int f2fs_get_volume_name(struct file *filp, unsigned long arg) +static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -3391,7 +3391,7 @@ static int f2fs_get_volume_name(struct file *filp, unsigned long arg) return err; } -static int f2fs_set_volume_name(struct file *filp, unsigned long arg) +static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -3770,11 +3770,11 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -ENOSPC; switch (cmd) { - case F2FS_IOC_GETFLAGS: + case FS_IOC_GETFLAGS: return f2fs_ioc_getflags(filp, arg); - case F2FS_IOC_SETFLAGS: + case FS_IOC_SETFLAGS: return f2fs_ioc_setflags(filp, arg); - case F2FS_IOC_GETVERSION: + case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: return f2fs_ioc_start_atomic_write(filp); @@ -3790,11 +3790,11 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_shutdown(filp, arg); case FITRIM: return f2fs_ioc_fitrim(filp, arg); - case F2FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_SET_ENCRYPTION_POLICY: return f2fs_ioc_set_encryption_policy(filp, arg); - case F2FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY: return f2fs_ioc_get_encryption_policy(filp, arg); - case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case FS_IOC_GET_ENCRYPTION_PWSALT: return f2fs_ioc_get_encryption_pwsalt(filp, arg); case FS_IOC_GET_ENCRYPTION_POLICY_EX: return f2fs_ioc_get_encryption_policy_ex(filp, arg); @@ -3822,9 +3822,9 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_flush_device(filp, arg); case F2FS_IOC_GET_FEATURES: return f2fs_ioc_get_features(filp, arg); - case F2FS_IOC_FSGETXATTR: + case FS_IOC_FSGETXATTR: return f2fs_ioc_fsgetxattr(filp, arg); - case F2FS_IOC_FSSETXATTR: + case FS_IOC_FSSETXATTR: return f2fs_ioc_fssetxattr(filp, arg); case F2FS_IOC_GET_PIN_FILE: return f2fs_ioc_get_pin_file(filp, arg); @@ -3838,10 +3838,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_enable_verity(filp, arg); case FS_IOC_MEASURE_VERITY: return f2fs_ioc_measure_verity(filp, arg); - case F2FS_IOC_GET_VOLUME_NAME: - return f2fs_get_volume_name(filp, arg); - case F2FS_IOC_SET_VOLUME_NAME: - return f2fs_set_volume_name(filp, arg); + case FS_IOC_GETFSLABEL: + return f2fs_ioc_getfslabel(filp, arg); + case 
FS_IOC_SETFSLABEL: + return f2fs_ioc_setfslabel(filp, arg); case F2FS_IOC_GET_COMPRESS_BLOCKS: return f2fs_get_compress_blocks(filp, arg); case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: @@ -3972,14 +3972,14 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { - case F2FS_IOC32_GETFLAGS: - cmd = F2FS_IOC_GETFLAGS; + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; break; - case F2FS_IOC32_SETFLAGS: - cmd = F2FS_IOC_SETFLAGS; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; break; - case F2FS_IOC32_GETVERSION: - cmd = F2FS_IOC_GETVERSION; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; break; case F2FS_IOC_START_ATOMIC_WRITE: case F2FS_IOC_COMMIT_ATOMIC_WRITE: @@ -3987,9 +3987,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RELEASE_VOLATILE_WRITE: case F2FS_IOC_ABORT_VOLATILE_WRITE: case F2FS_IOC_SHUTDOWN: - case F2FS_IOC_SET_ENCRYPTION_POLICY: - case F2FS_IOC_GET_ENCRYPTION_PWSALT: - case F2FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_PWSALT: + case FS_IOC_GET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY_EX: case FS_IOC_ADD_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY: @@ -4003,16 +4003,16 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: case F2FS_IOC_GET_FEATURES: - case F2FS_IOC_FSGETXATTR: - case F2FS_IOC_FSSETXATTR: + case FS_IOC_FSGETXATTR: + case FS_IOC_FSSETXATTR: case F2FS_IOC_GET_PIN_FILE: case F2FS_IOC_SET_PIN_FILE: case F2FS_IOC_PRECACHE_EXTENTS: case F2FS_IOC_RESIZE_FS: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: - case F2FS_IOC_GET_VOLUME_NAME: - case F2FS_IOC_SET_VOLUME_NAME: + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: case F2FS_IOC_GET_COMPRESS_BLOCKS: case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: From 7a9e6acbaefaaecbe4f5e10902fd16c6189d0148 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 20 Jul 2020 16:52:50 +0800 Subject: [PATCH 034/580] f2fs: compress: fix to avoid memory leak on cc->cpages Memory allocated for storing compressed pages' poitner should be released after f2fs_write_compressed_pages(), otherwise it will cause memory leak issue. Signed-off-by: Chao Yu Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 69ad6a331830..5d0788953308 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1356,6 +1356,8 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_compressed_pages(cc, submitted, wbc, io_type); cops->destroy_compress_ctx(cc); + kfree(cc->cpages); + cc->cpages = NULL; if (!err) return 0; f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); From 356825ac3ed40f1c675de52cd1159b8bf82f70a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 19 Jul 2020 17:13:44 -0700 Subject: [PATCH 035/580] f2fs: segment.h: delete a duplicated word Drop the repeated word "the" in a comment. 
Signed-off-by: Randy Dunlap Cc: Jaegeuk Kim Cc: Chao Yu Cc: linux-f2fs-devel@lists.sourceforge.net Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index f261e3e6a69b..752b177073b2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -187,7 +187,7 @@ struct seg_entry { unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ #endif /* - * # of valid blocks and the validity bitmap stored in the the last + * # of valid blocks and the validity bitmap stored in the last * checkpoint pack. This information is used by the SSR mode. */ unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ From 21d5367e19cbfb367069d4dc7a66ff010dd51c73 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 16 Jul 2020 09:57:03 -0700 Subject: [PATCH 036/580] f2fs: should avoid inode eviction in synchronous path https://bugzilla.kernel.org/show_bug.cgi?id=208565 PID: 257 TASK: ecdd0000 CPU: 0 COMMAND: "init" #0 [] (__schedule) from [] #1 [] (schedule) from [] #2 [] (rwsem_down_read_failed) from [] #3 [] (down_read) from [] #4 [] (f2fs_truncate_blocks) from [] #5 [] (f2fs_truncate) from [] #6 [] (f2fs_evict_inode) from [] #7 [] (evict) from [] #8 [] (iput) from [] #9 [] (f2fs_sync_node_pages) from [] #10 [] (f2fs_write_checkpoint) from [] #11 [] (f2fs_sync_fs) from [] #12 [] (f2fs_do_sync_file) from [] #13 [] (f2fs_sync_file) from [] #14 [] (vfs_fsync_range) from [] #15 [] (do_fsync) from [] #16 [] (sys_fsync) from [] This can be caused by flush_dirty_inode() in f2fs_sync_node_pages() where iput() requires f2fs_lock_op() again resulting in livelock. Reported-by: Zhiguo Niu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c377560b9c13..0dee8b41ec30 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1926,8 +1926,12 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, goto continue_unlock; } - /* flush inline_data, if it's async context. */ - if (do_balance && is_inline_node(page)) { + /* flush inline_data/inode, if it's async context. */ + if (!do_balance) + goto write_node; + + /* flush inline_data */ + if (is_inline_node(page)) { clear_inline_node(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); @@ -1940,7 +1944,7 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, if (flush_dirty_inode(page)) goto lock_node; } - +write_node: f2fs_wait_on_page_writeback(page, NODE, true, true); if (!clear_page_dirty_for_io(page)) From cf21898db2483f24435db6ef0ae66d955aff04b6 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 21 Jul 2020 12:21:11 +0900 Subject: [PATCH 037/580] f2fs: add F2FS_IOC_SEC_TRIM_FILE ioctl Added a new ioctl to send discard commands or/and zero out to selected data area of a regular file for security reason. The way of handling range.len of F2FS_IOC_SEC_TRIM_FILE: 1. Added -1 value support for range.len to secure trim the whole blocks starting from range.start regardless of i_size. 2. If the end of the range passes over the end of file, it means until the end of file (i_size). 3. ignored the case of that range.len is zero to prevent the function from making end_addr zero and triggering different behaviour of the function. 
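Not part of the kernel patch: a hedged userspace sketch of how the new ioctl might be exercised. The structure layout, ioctl number and flag values are copied from the hunks below and repeated locally only to keep the example self-contained; a real tool should take them from the exported f2fs header instead.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

/* Local copies for illustration only; authoritative definitions are in fs/f2fs/f2fs.h below. */
struct f2fs_sectrim_range {
	uint64_t start;
	uint64_t len;
	uint64_t flags;
};
#define F2FS_IOCTL_MAGIC	0xf5
#define F2FS_IOC_SEC_TRIM_FILE	_IOW(F2FS_IOCTL_MAGIC, 20, struct f2fs_sectrim_range)
#define F2FS_TRIM_FILE_DISCARD	0x1
#define F2FS_TRIM_FILE_ZEROOUT	0x2

int main(int argc, char **argv)
{
	/* Secure-trim the whole file: start at 0, len == -1 means "to the end of file". */
	struct f2fs_sectrim_range range = {
		.start = 0,
		.len = (uint64_t)-1,
		.flags = F2FS_TRIM_FILE_DISCARD | F2FS_TRIM_FILE_ZEROOUT,
	};
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <regular-file-on-f2fs>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_WRONLY);	/* the ioctl requires a writable fd */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, F2FS_IOC_SEC_TRIM_FILE, &range) < 0)
		perror("F2FS_IOC_SEC_TRIM_FILE");

	close(fd);
	return 0;
}

Per the checks in the patch, the target must be a regular, non-atomic, non-compressed file, range.start must be F2FS_BLKSIZE-aligned and below i_size, and the discard flag is refused with EOPNOTSUPP when the device cannot discard.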
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 15 ++++ fs/f2fs/file.c | 191 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 06db56a4bf34..4ec25eaa13fd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -429,6 +429,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, _IOR(F2FS_IOCTL_MAGIC, 18, __u64) #define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \ _IOR(F2FS_IOCTL_MAGIC, 19, __u64) +#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, \ + struct f2fs_sectrim_range) /* * should be same as XFS_IOC_GOINGDOWN. @@ -441,6 +443,13 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ #define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */ +/* + * Flags used by F2FS_IOC_SEC_TRIM_FILE + */ +#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */ +#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */ +#define F2FS_TRIM_FILE_MASK 0x3 + struct f2fs_gc_range { u32 sync; u64 start; @@ -464,6 +473,12 @@ struct f2fs_flush_device { u32 segments; /* # of segments to flush */ }; +struct f2fs_sectrim_range { + u64 start; + u64 len; + u64 flags; +}; + /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 static inline int get_extra_isize(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f073f257229c..a474799fb126 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -3762,6 +3763,193 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) return ret; } +static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, + pgoff_t off, block_t block, block_t len, u32 flags) +{ + struct request_queue *q = bdev_get_queue(bdev); + sector_t sector = SECTOR_FROM_BLOCK(block); + sector_t nr_sects = SECTOR_FROM_BLOCK(len); + int ret = 0; + + if (!q) + return -ENXIO; + + if (flags & F2FS_TRIM_FILE_DISCARD) + ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS, + blk_queue_secure_erase(q) ? 
+ BLKDEV_DISCARD_SECURE : 0); + + if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { + if (IS_ENCRYPTED(inode)) + ret = fscrypt_zeroout_range(inode, off, block, len); + else + ret = blkdev_issue_zeroout(bdev, sector, nr_sects, + GFP_NOFS, 0); + } + + return ret; +} + +static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct block_device *prev_bdev = NULL; + struct f2fs_sectrim_range range; + pgoff_t index, pg_end, prev_index = 0; + block_t prev_block = 0, len = 0; + loff_t end_addr; + bool to_end = false; + int ret = 0; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_sectrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (range.flags == 0 || (range.flags & ~F2FS_TRIM_FILE_MASK) || + !S_ISREG(inode->i_mode)) + return -EINVAL; + + if (((range.flags & F2FS_TRIM_FILE_DISCARD) && + !f2fs_hw_support_discard(sbi)) || + ((range.flags & F2FS_TRIM_FILE_ZEROOUT) && + IS_ENCRYPTED(inode) && f2fs_is_multi_device(sbi))) + return -EOPNOTSUPP; + + file_start_write(filp); + inode_lock(inode); + + if (f2fs_is_atomic_file(inode) || f2fs_compressed_file(inode) || + range.start >= inode->i_size) { + ret = -EINVAL; + goto err; + } + + if (range.len == 0) + goto err; + + if (inode->i_size - range.start > range.len) { + end_addr = range.start + range.len; + } else { + end_addr = range.len == (u64)-1 ? + sbi->sb->s_maxbytes : inode->i_size; + to_end = true; + } + + if (!IS_ALIGNED(range.start, F2FS_BLKSIZE) || + (!to_end && !IS_ALIGNED(end_addr, F2FS_BLKSIZE))) { + ret = -EINVAL; + goto err; + } + + index = F2FS_BYTES_TO_BLK(range.start); + pg_end = DIV_ROUND_UP(end_addr, F2FS_BLKSIZE); + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto err; + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + ret = filemap_write_and_wait_range(mapping, range.start, + to_end ? LLONG_MAX : end_addr - 1); + if (ret) + goto out; + + truncate_inode_pages_range(mapping, range.start, + to_end ? 
-1 : end_addr - 1); + + while (index < pg_end) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + int i; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + index = f2fs_get_next_page_offset(&dn, index); + continue; + } + goto out; + } + + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + count = min(end_offset - dn.ofs_in_node, pg_end - index); + for (i = 0; i < count; i++, index++, dn.ofs_in_node++) { + struct block_device *cur_bdev; + block_t blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + f2fs_put_dnode(&dn); + goto out; + } + + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL); + if (f2fs_is_multi_device(sbi)) { + int di = f2fs_target_device_index(sbi, blkaddr); + + blkaddr -= FDEV(di).start_blk; + } + + if (len) { + if (prev_bdev == cur_bdev && + index == prev_index + len && + blkaddr == prev_block + len) { + len++; + } else { + ret = f2fs_secure_erase(prev_bdev, + inode, prev_index, prev_block, + len, range.flags); + if (ret) { + f2fs_put_dnode(&dn); + goto out; + } + + len = 0; + } + } + + if (!len) { + prev_bdev = cur_bdev; + prev_index = index; + prev_block = blkaddr; + len = 1; + } + } + + f2fs_put_dnode(&dn); + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + cond_resched(); + } + + if (len) + ret = f2fs_secure_erase(prev_bdev, inode, prev_index, + prev_block, len, range.flags); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); +err: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -3848,6 +4036,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_release_compress_blocks(filp, arg); case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: return f2fs_reserve_compress_blocks(filp, arg); + case F2FS_IOC_SEC_TRIM_FILE: + return f2fs_sec_trim_file(filp, arg); default: return -ENOTTY; } @@ -4016,6 +4206,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_COMPRESS_BLOCKS: case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: + case F2FS_IOC_SEC_TRIM_FILE: break; default: return -ENOIOCTLCMD; From 6418e258cb1eebabf96a072445c57407b4e1fe04 Mon Sep 17 00:00:00 2001 From: Jia Yang Date: Tue, 21 Jul 2020 11:49:14 +0800 Subject: [PATCH 038/580] f2fs: Change the type of f2fs_flush_inline_data() to void The return value of f2fs_flush_inline_data() is not used, so delete it. 
Signed-off-by: Jia Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4ec25eaa13fd..6abf11a4256f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3288,7 +3288,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); int f2fs_move_node_page(struct page *node_page, int gc_type); -int f2fs_flush_inline_data(struct f2fs_sb_info *sbi); +void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0dee8b41ec30..2eb71a6d896b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1816,12 +1816,11 @@ static bool flush_dirty_inode(struct page *page) return true; } -int f2fs_flush_inline_data(struct f2fs_sb_info *sbi) +void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) { pgoff_t index = 0; struct pagevec pvec; int nr_pages; - int ret = 0; pagevec_init(&pvec); @@ -1860,7 +1859,6 @@ int f2fs_flush_inline_data(struct f2fs_sb_info *sbi) pagevec_release(&pvec); cond_resched(); } - return ret; } int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, From df3a71af2d083d232ab41faef559d625badee058 Mon Sep 17 00:00:00 2001 From: Li Guifu Date: Fri, 24 Jul 2020 09:38:11 +0800 Subject: [PATCH 039/580] f2fs: fix use-after-free issue During umount, f2fs_put_super() unregisters procfs entries after f2fs_destroy_segment_manager(), it may cause use-after-free issue when umount races with procfs accessing, fix it by relocating f2fs_unregister_sysfs(). [Chao Yu: change commit title/message a bit] Signed-off-by: Li Guifu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 64c207211675..d215c9f40d95 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1179,6 +1179,9 @@ static void f2fs_put_super(struct super_block *sb) int i; bool dropped; + /* unregister procfs/sysfs entries in advance to avoid race case */ + f2fs_unregister_sysfs(sbi); + f2fs_quota_off_umount(sb); /* prevent remaining shrinker jobs */ @@ -1244,8 +1247,6 @@ static void f2fs_put_super(struct super_block *sb) kvfree(sbi->ckpt); - f2fs_unregister_sysfs(sbi); - sb->s_fs_info = NULL; if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); From 6eac5b985a36617834aaf917c5664b8193de1d3f Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Fri, 24 Jul 2020 16:55:28 +0800 Subject: [PATCH 040/580] f2fs: space related cleanup Just for code style, no logic change 1. delete useless space 2. 
change spaces into tab Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/namei.c | 8 ++++---- fs/f2fs/node.c | 4 ++-- fs/f2fs/sysfs.c | 4 ++-- fs/f2fs/xattr.c | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8f5753cc3037..f86c6ba7cb82 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1415,7 +1415,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - /* 2 cp + n data seg summary + orphan inode blocks */ + /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false); spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 42b98055f9e2..14ab1692157f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -85,7 +85,7 @@ static bool __is_cp_guaranteed(struct page *page) sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi) || - inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode) || (S_ISREG(inode->i_mode) && (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || @@ -1145,7 +1145,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, 0, 0}; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6abf11a4256f..c6e9064c68e8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1438,7 +1438,7 @@ struct f2fs_sb_info { unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ - struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ + struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ spinlock_t fsync_node_lock; /* for node entry lock */ struct list_head fsync_node_list; /* node list head */ @@ -1518,7 +1518,7 @@ struct f2fs_sb_info { unsigned int next_victim_seg[2]; /* next segment in victim section */ /* for skip statistic */ - unsigned int atomic_files; /* # of opened atomic file */ + unsigned int atomic_files; /* # of opened atomic file */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a15a2831d43b..84e4bbc1a64d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -602,7 +602,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the - * negative dentries at f2fs_lookup(), when it is better + * negative dentries at f2fs_lookup(), when it is better * supported by the VFS for the CI case. 
*/ if (IS_CASEFOLDED(dir)) @@ -1287,7 +1287,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { - .get_link = f2fs_encrypted_get_link, + .get_link = f2fs_encrypted_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, .listxattr = f2fs_listxattr, @@ -1313,7 +1313,7 @@ const struct inode_operations f2fs_dir_inode_operations = { }; const struct inode_operations f2fs_symlink_inode_operations = { - .get_link = f2fs_get_link, + .get_link = f2fs_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, .listxattr = f2fs_listxattr, @@ -1321,7 +1321,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, - .setattr = f2fs_setattr, + .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2eb71a6d896b..f1a9150add78 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1728,7 +1728,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, set_dentry_mark(page, f2fs_need_dentry_mark(sbi, ino)); } - /* may be written by other thread */ + /* may be written by other thread */ if (!PageDirty(page)) set_page_dirty(page); } @@ -2101,7 +2101,7 @@ const struct address_space_operations f2fs_node_aops = { .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, #ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, + .migratepage = f2fs_migrate_page, #endif }; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 64dc083a82cc..ba1901f5c828 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -27,7 +27,7 @@ enum { NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ #ifdef CONFIG_F2FS_STAT_FS - STAT_INFO, /* struct f2fs_stat_info */ + STAT_INFO, /* struct f2fs_stat_info */ #endif #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_RATE, /* struct f2fs_fault_info */ @@ -713,7 +713,7 @@ static struct kobj_type f2fs_ktype = { }; static struct kset f2fs_kset = { - .kobj = {.ktype = &f2fs_ktype}, + .kobj = {.ktype = &f2fs_ktype}, }; static struct kobj_type f2fs_feat_ktype = { diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 4f6582ef7ee3..1b0736ce0918 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -175,8 +175,8 @@ const struct xattr_handler f2fs_xattr_trusted_handler = { const struct xattr_handler f2fs_xattr_advise_handler = { .name = F2FS_SYSTEM_ADVISE_NAME, .flags = F2FS_XATTR_INDEX_ADVISE, - .get = f2fs_xattr_advise_get, - .set = f2fs_xattr_advise_set, + .get = f2fs_xattr_advise_get, + .set = f2fs_xattr_advise_set, }; const struct xattr_handler f2fs_xattr_security_handler = { From 5a31735c48df3a679d283ac57909a116beb49609 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 24 Jul 2020 18:21:36 +0800 Subject: [PATCH 041/580] f2fs: compress: fix to update isize when overwriting compressed file We missed to update isize of compressed file in write_end() with below case: cluster size is 16KB - write 14KB data from offset 0 - overwrite 16KB data from offset 0 Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 14ab1692157f..24d44a9d021c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3475,6 +3475,10 @@ static int f2fs_write_end(struct file *file, if (f2fs_compressed_file(inode) && fsdata) { 
f2fs_compress_write_end(inode, fsdata, page->index, copied); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (pos + copied > i_size_read(inode) && + !f2fs_verity_in_progress(inode)) + f2fs_i_size_write(inode, pos + copied); return copied; } #endif From 084396a1dd48a644bee09e43c3de981f73b30e92 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 25 Jul 2020 09:17:48 +0800 Subject: [PATCH 042/580] f2fs: compress: delay temp page allocation Currently, we allocate temp pages which is used to pad hole in cluster during read IO submission, it may take long time before releasing them in f2fs_decompress_pages(), since they are only used as temp output buffer in decompression context, so let's just do the allocation in that context to reduce time of memory pool resource occupation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 5d0788953308..b9f204dd6548 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -641,6 +641,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; int ret; + int i; dec_page_count(sbi, F2FS_RD_DATA); @@ -659,6 +660,26 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) goto out_free_dic; } + dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + dic->cluster_size, GFP_NOFS); + if (!dic->tpages) { + ret = -ENOMEM; + goto out_free_dic; + } + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) { + dic->tpages[i] = dic->rpages[i]; + continue; + } + + dic->tpages[i] = f2fs_compress_alloc_page(); + if (!dic->tpages[i]) { + ret = -ENOMEM; + goto out_free_dic; + } + } + if (cops->init_decompress_ctx) { ret = cops->init_decompress_ctx(dic); if (ret) @@ -1420,22 +1441,6 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->cpages[i] = page; } - dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - dic->cluster_size, GFP_NOFS); - if (!dic->tpages) - goto out_free; - - for (i = 0; i < dic->cluster_size; i++) { - if (cc->rpages[i]) { - dic->tpages[i] = cc->rpages[i]; - continue; - } - - dic->tpages[i] = f2fs_compress_alloc_page(); - if (!dic->tpages[i]) - goto out_free; - } - return dic; out_free: From 6f7abfff26bc4605dae35970ac211ddd78fcd61a Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Sat, 25 Jul 2020 18:05:27 +0800 Subject: [PATCH 043/580] f2fs: correct comment of f2fs_exist_written_data Function parameter mode could be TRANS_DIR_INO. 
Signed-off-by: Jack Qiu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f86c6ba7cb82..8c782d3f324f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -523,7 +523,7 @@ void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) __remove_ino_entry(sbi, ino, type); } -/* mode should be APPEND_INO or UPDATE_INO */ +/* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */ bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) { struct inode_management *im = &sbi->im[mode]; From b9acb8c4d63c87381504ed1b78e8bc82276e1ec4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jul 2020 08:26:11 -0700 Subject: [PATCH 044/580] f2fs: fix deadlock between quota writes and checkpoint f2fs_write_data_pages(quota_mapping) __f2fs_write_data_pages f2fs_write_checkpoint * blk_start_plug(&plug); * add bio in write_io[DATA] - block_operations - skip syncing quota by >DEFAULT_RETRY_QUOTA_FLUSH_COUNT - down_write(&sbi->node_write); - f2fs_write_single_data_page - down_read(node_write) - f2fs_wait_on_all_pages(F2FS_WB_CP_DATA); Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8c782d3f324f..99c8061da55b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1269,6 +1269,8 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) if (type == F2FS_DIRTY_META) f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + else if (type == F2FS_WB_CP_DATA) + f2fs_submit_merged_write(sbi, DATA); io_schedule_timeout(DEFAULT_IO_TIMEOUT); } finish_wait(&sbi->cp_wait, &wait); From 27d28f07fba93e0da12123eba2024d296850abc1 Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Wed, 29 Jul 2020 09:29:01 +0800 Subject: [PATCH 045/580] f2fs: use macro instead of f2fs verity version Because fsverity_descriptor_location.version is constant, so use macro for better reading. 
Signed-off-by: Jack Qiu Signed-off-by: Jaegeuk Kim --- fs/f2fs/verity.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 077ac835ec53..95be8222e2eb 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -29,6 +29,8 @@ #include "f2fs.h" #include "xattr.h" +#define F2FS_VERIFY_VER (1) + static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode) { return round_up(inode->i_size, 65536); @@ -152,7 +154,7 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc, struct inode *inode = file_inode(filp); u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; struct fsverity_descriptor_location dloc = { - .version = cpu_to_le32(1), + .version = cpu_to_le32(F2FS_VERIFY_VER), .size = cpu_to_le32(desc_size), .pos = cpu_to_le64(desc_pos), }; @@ -199,7 +201,7 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL); if (res < 0 && res != -ERANGE) return res; - if (res != sizeof(dloc) || dloc.version != cpu_to_le32(1)) { + if (res != sizeof(dloc) || dloc.version != cpu_to_le32(F2FS_VERIFY_VER)) { f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format"); return -EINVAL; } From 7c5c66d2c6d6151a7417fddce851d4b8d2be080b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 29 Jul 2020 21:21:35 +0800 Subject: [PATCH 046/580] f2fs: compress: add sanity check during compressed cluster read In f2fs_read_multi_pages(), we don't have to check cluster's type again, since overwrite or partial truncation need page lock in cluster which has already been held by reader, so cluster's type is stable, let's change check condition to sanity check. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 24d44a9d021c..d84b0b447ce5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2169,9 +2169,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (ret) goto out; - /* cluster was overwritten as normal cluster */ - if (dn.data_blkaddr != COMPRESS_ADDR) - goto out; + f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; From dada001be95c416d5d1312eb512888d970a908ca Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 29 Jul 2020 21:21:36 +0800 Subject: [PATCH 047/580] f2fs: compress: disable compression mount option if compression is off If CONFIG_F2FS_FS_COMPRESSION is off, don't allow to configure or show compression related mount option. 
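For illustration only (not part of the patch): with CONFIG_F2FS_FS_COMPRESSION disabled, a mount that passes compress_algorithm, compress_log_size or compress_extension is expected to log "compression options not supported" and continue, and f2fs_show_compress_options() is no longer called from f2fs_show_options(), as the hunks below show.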
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d215c9f40d95..0f0317a9c0b2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -459,9 +459,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; +#ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; + int ext_cnt; +#endif char *p, *name; - int arg = 0, ext_cnt; + int arg = 0; kuid_t uid; kgid_t gid; int ret; @@ -852,6 +855,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; +#ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { f2fs_err(sbi, "Compression feature if off"); @@ -914,6 +918,13 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; +#else + case Opt_compress_algorithm: + case Opt_compress_log_size: + case Opt_compress_extension: + f2fs_info(sbi, "compression options not supported"); + break; +#endif default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1609,7 +1620,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) seq_printf(seq, ",fsync_mode=%s", "nobarrier"); +#ifdef CONFIG_F2FS_FS_COMPRESSION f2fs_show_compress_options(seq, sbi->sb); +#endif return 0; } From 01c6a8b4b283333d5e52e31a41c2f20cc7909959 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 30 Jul 2020 14:09:28 +0900 Subject: [PATCH 048/580] f2fs: make file immutable even if releasing zero compression block When we use F2FS_IOC_RELEASE_COMPRESS_BLOCKS ioctl, if we can't find any compressed blocks in the file even with large file size, the ioctl just ends up without changing the file's status as immutable. It makes the user, who expects that the file is immutable when it returns successfully, confused. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a474799fb126..84784aabc749 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3538,14 +3538,14 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (ret) goto out; - if (!F2FS_I(inode)->i_compr_blocks) - goto out; - F2FS_I(inode)->i_flags |= F2FS_IMMUTABLE_FL; f2fs_set_inode_flags(inode); inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); + if (!F2FS_I(inode)->i_compr_blocks) + goto out; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); From ae79f1841515cd9b80e38ffbdbe6a18d27638ed1 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 31 Jul 2020 02:18:13 -0400 Subject: [PATCH 049/580] f2fs: replace test_and_set/clear_bit() with set/clear_bit() Since set/clear_inode_flag() don't need to return value to show if flag is set, we can just call set/clear_bit() here. 
Signed-off-by: Yufen Yu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c6e9064c68e8..1057552ce801 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2655,7 +2655,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, static inline void set_inode_flag(struct inode *inode, int flag) { - test_and_set_bit(flag, F2FS_I(inode)->flags); + set_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, true); } @@ -2666,7 +2666,7 @@ static inline int is_inode_flag_set(struct inode *inode, int flag) static inline void clear_inode_flag(struct inode *inode, int flag) { - test_and_clear_bit(flag, F2FS_I(inode)->flags); + clear_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, false); } From 52ba8bd45e35f38a3fd565b78b07b22a2c860a86 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Sat, 1 Aug 2020 11:24:50 +0800 Subject: [PATCH 050/580] f2fs: update_sit_entry: Make the judgment condition of f2fs_bug_on more intuitive Current judgment condition of f2fs_bug_on in function update_sit_entry(): new_vblocks >> (sizeof(unsigned short) << 3) || new_vblocks > sbi->blocks_per_seg which equivalents to: new_vblocks < 0 || new_vblocks > sbi->blocks_per_seg The latter is more intuitive. Signed-off-by: Zhihao Cheng Reported-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 543878c6222c..c0df36ebac62 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2165,7 +2165,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) new_vblocks = se->valid_blocks + del; offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) || + f2fs_bug_on(sbi, (new_vblocks < 0 || (new_vblocks > sbi->blocks_per_seg))); se->valid_blocks = new_vblocks; From 4efb07eff34dc1494ad32a21fd61f0cd9febc3ff Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 3 Aug 2020 19:37:12 -0700 Subject: [PATCH 051/580] f2fs: prepare a waiter before entering io_schedule This is to avoid sleep() in the waiter thread. [ 20.157753] ------------[ cut here ]------------ [ 20.158393] do not call blocking ops when !TASK_RUNNING; state=2 set at [<0000000096354225>] prepare_to_wait+0xcd/0x430 [ 20.159858] WARNING: CPU: 1 PID: 1152 at kernel/sched/core.c:7142 __might_sleep+0x149/0x1a0 ... [ 20.176110] __submit_merged_write_cond+0x191/0x310 [ 20.176739] f2fs_submit_merged_write+0x18/0x20 [ 20.177323] f2fs_wait_on_all_pages+0x269/0x2d0 [ 20.177899] ? block_operations+0x980/0x980 [ 20.178441] ? __kasan_check_read+0x11/0x20 [ 20.178975] ? finish_wait+0x260/0x260 [ 20.179488] ? 
percpu_counter_set+0x147/0x230 [ 20.180049] do_checkpoint+0x1757/0x2a50 [ 20.180558] f2fs_write_checkpoint+0x840/0xaf0 [ 20.181126] f2fs_sync_fs+0x287/0x4a0 Reported-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 99c8061da55b..ff807e14c891 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1258,8 +1258,6 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) DEFINE_WAIT(wait); for (;;) { - prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, type)) break; @@ -1271,6 +1269,8 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) FS_CP_META_IO); else if (type == F2FS_WB_CP_DATA) f2fs_submit_merged_write(sbi, DATA); + + prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); io_schedule_timeout(DEFAULT_IO_TIMEOUT); } finish_wait(&sbi->cp_wait, &wait); From 98351bb2334b235f2f8e37ad8c1971ab7e2bad80 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Wed, 19 Aug 2020 10:34:48 +0900 Subject: [PATCH 052/580] f2fs: Fix type of section block count variables Commit da52f8ade40b ("f2fs: get the right gc victim section when section has several segments") added code to count blocks of each section using variables with type 'unsigned short', which has 2 bytes size in many systems. However, the counts can be larger than the 2 bytes range and type conversion results in wrong values. Especially when the f2fs sections have blocks as many as USHRT_MAX + 1, the count is handled as 0. This triggers eternal loop in init_dirty_segmap() at mount system call. Fix this by changing the type of the variables to block_t. Fixes: da52f8ade40b ("f2fs: get the right gc victim section when section has several segments") Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c0df36ebac62..68c6df4dd2f8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -799,7 +799,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); - unsigned short valid_blocks = + block_t valid_blocks = get_valid_blocks(sbi, segno, true); f2fs_bug_on(sbi, unlikely(!valid_blocks || @@ -815,7 +815,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned short valid_blocks; + block_t valid_blocks; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]--; @@ -4315,8 +4315,8 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; - unsigned short valid_blocks; - unsigned short blks_per_sec = BLKS_PER_SEC(sbi); + block_t valid_blocks; + block_t blks_per_sec = BLKS_PER_SEC(sbi); while (1) { /* find dirty segment based on free segmap */ From 2f02e2fd54e98423aada0947ac10f950531adb0e Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 18 Aug 2020 15:40:14 +0530 Subject: [PATCH 053/580] f2fs: fix indefinite loop scanning for free nid If the sbi->ckpt->next_free_nid is not NAT block aligned and if there are free nids in that NAT block 
between the start of the block and next_free_nid, then those free nids will not be scanned in scan_nat_page(). This results into mismatch between nm_i->available_nids and the sum of nm_i->free_nid_count of all NAT blocks scanned. And nm_i->available_nids will always be greater than the sum of free nids in all the blocks. Under this condition, if we use all the currently scanned free nids, then it will loop forever in f2fs_alloc_nid() as nm_i->available_nids is still not zero but nm_i->free_nid_count of that partially scanned NAT block is zero. Fix this to align the nm_i->next_scan_nid to the first nid of the corresponding NAT block. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f1a9150add78..38ebfc83caf7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2373,6 +2373,9 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, if (unlikely(nid >= nm_i->max_nid)) nid = 0; + if (unlikely(nid % NAT_ENTRY_PER_BLOCK)) + nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return 0; From e1712134c01acfde2d20aea490ad3b3581ec8e0a Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 19 Aug 2020 16:07:31 -0400 Subject: [PATCH 054/580] f2fs: Return EOF on unaligned end of file DIO read Reading past end of file returns EOF for aligned reads but -EINVAL for unaligned reads on f2fs. While documentation is not strict about this corner case, most filesystem returns EOF on this case, like iomap filesystems. This patch consolidates the behavior for f2fs, by making it return EOF(0). it can be verified by a read loop on a file that does a partial read before EOF (A file that doesn't end at an aligned address). The following code fails on an unaligned file on f2fs, but not on btrfs, ext4, and xfs. while (done < total) { ssize_t delta = pread(fd, buf + done, total - done, off + done); if (!delta) break; ... } It is arguable whether filesystems should actually return EOF or -EINVAL, but since iomap filesystems support it, and so does the original DIO code, it seems reasonable to consolidate on that. Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d84b0b447ce5..f43ed6fb9c4a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3504,6 +3504,9 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, unsigned long align = offset | iov_iter_alignment(iter); struct block_device *bdev = inode->i_sb->s_bdev; + if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode)) + return 1; + if (align & blocksize_mask) { if (bdev) blkbits = blksize_bits(bdev_logical_block_size(bdev)); From 318a62dab4bf37dab47dea15dae1be79ce86f03b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Sep 2020 11:17:38 -0700 Subject: [PATCH 055/580] f2fs: update changes in upstream on GC_URGENT_HIGH This fixes the missing change. Note that, the below commit id is from mainline, original patch to catch up. 
Fixes: 0e5e81114de1 ("f2fs: add GC_URGENT_LOW mode in gc_urgent") Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index ba1901f5c828..c9ca43c55a53 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -357,7 +357,9 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return -EINVAL; if (!strcmp(a->attr.name, "gc_urgent")) { - if (t == 1) { + if (t == 0) { + sbi->gc_mode = GC_NORMAL; + } else if (t == 1) { sbi->gc_mode = GC_URGENT_HIGH; if (sbi->gc_thread) { sbi->gc_thread->gc_wake = 1; @@ -368,7 +370,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, } else if (t == 2) { sbi->gc_mode = GC_URGENT_LOW; } else { - sbi->gc_mode = GC_NORMAL; + return -EINVAL; } return count; } From 9e7ce20e193049fa6a192ea902bd4ea94b650a55 Mon Sep 17 00:00:00 2001 From: Aravind Ramesh Date: Thu, 16 Jul 2020 18:26:56 +0530 Subject: [PATCH 056/580] f2fs: support zone capacity less than zone size NVMe Zoned Namespace devices can have zone-capacity less than zone-size. Zone-capacity indicates the maximum number of sectors that are usable in a zone, beginning from the first sector of the zone. This makes the sectors after the zone-capacity, up to the zone-size, unusable. This patch set tracks zone-size and zone-capacity in zoned devices and calculates the usable blocks per segment and usable segments per section. If zone-capacity is less than zone-size, mark only those segments which start before zone-capacity as free segments. All segments at and beyond zone-capacity are treated as permanently used segments. In cases where zone-capacity does not align with the segment size, the last segment will start before zone-capacity and end beyond the zone-capacity of the zone. For such spanning segments, only sectors within the zone-capacity are used. During writes and GC, manage the usable segments in a section and usable blocks per segment. Segments which are beyond zone-capacity are never allocated and do not need to be garbage collected; only the segments before zone-capacity need to be garbage collected. For spanning segments, based on the number of usable blocks in that segment, write only to blocks up to the zone-capacity. Zone-capacity is device specific and cannot be configured by the user. Since NVMe ZNS device zones are sequential-write-only, a block device with conventional zones or any normal block device is needed along with the ZNS device for the metadata operations of f2fs. A typical nvme-cli output of a zoned device shows zone start, capacity and write pointer as below: SLBA: 0x0 WP: 0x0 Cap: 0x18800 State: EMPTY Type: SEQWRITE_REQ SLBA: 0x20000 WP: 0x20000 Cap: 0x18800 State: EMPTY Type: SEQWRITE_REQ SLBA: 0x40000 WP: 0x40000 Cap: 0x18800 State: EMPTY Type: SEQWRITE_REQ Here the zone size is 64MB and the capacity is 49MB; WP is at the zone start as the zones are in the EMPTY state. For each zone, only zone start + 49MB is usable area; any lba/sector after 49MB cannot be read or written to, and the drive will fail any attempts to read/write. So, the second zone starts at 64MB and is usable till 113MB (64 + 49), and the range between 113 and 128MB is again unusable. The next zone starts at 128MB, and so on.
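As an illustration of the spanning-segment arithmetic described above, the sketch below mirrors the idea behind the usable-block calculation; the helper name, parameters and numbers are made up for this example and are not taken from the patch, which exposes the real logic through f2fs_usable_blks_in_seg() and f2fs_usable_segs_in_sec().

/*
 * Illustrative sketch only, not part of the patch series: usable blocks of
 * one segment on a zoned device whose zone-capacity is smaller than its
 * zone-size. All names here are local to this example.
 */
static unsigned int example_usable_blks_in_seg(unsigned int seg_start_blk,
					       unsigned int blks_per_seg,
					       unsigned int zone_start_blk,
					       unsigned int zone_cap_blks)
{
	unsigned int cap_end_blk = zone_start_blk + zone_cap_blks;

	if (seg_start_blk >= cap_end_blk)		/* wholly beyond capacity: never used */
		return 0;
	if (seg_start_blk + blks_per_seg <= cap_end_blk)	/* wholly within capacity */
		return blks_per_seg;
	return cap_end_blk - seg_start_blk;		/* spanning segment: partial use */
}

With the numbers from the nvme-cli output above (64MB zones, 49MB capacity, 2MB segments, 4KB blocks), the segment starting at 48MB in the first zone would report 256 usable blocks and every later segment in that zone would report 0, which is why such segments are neither allocated nor garbage collected.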
Signed-off-by: Aravind Ramesh Signed-off-by: Damien Le Moal Signed-off-by: Niklas Cassel Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 +++ fs/f2fs/gc.c | 25 +++++++++++++----- fs/f2fs/gc.h | 44 ++++++++++++++++++++++++++++--- fs/f2fs/segment.c | 66 ++++++++++++++++++++++++++++++++++++++--------- fs/f2fs/segment.h | 26 ++++++++++++------- 5 files changed, 134 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1057552ce801..4ebda759c738 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3382,6 +3382,10 @@ void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp); +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno); +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno); /* * checkpoint.c diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 11b4adde9baf..2232bcf344b6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -266,13 +266,14 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) unsigned char age = 0; unsigned char u; unsigned int i; + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno); - for (i = 0; i < sbi->segs_per_sec; i++) + for (i = 0; i < usable_segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; vblocks = get_valid_blocks(sbi, segno, true); - mtime = div_u64(mtime, sbi->segs_per_sec); - vblocks = div_u64(vblocks, sbi->segs_per_sec); + mtime = div_u64(mtime, usable_segs_per_sec); + vblocks = div_u64(vblocks, usable_segs_per_sec); u = (vblocks * 100) >> sbi->log_blocks_per_seg; @@ -536,6 +537,7 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, int phase = 0; bool fggc = (gc_type == FG_GC); int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); @@ -545,7 +547,7 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, if (fggc && phase == 2) atomic_inc(&sbi->wb_sync_req[NODE]); - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + for (off = 0; off < usable_blks_in_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; struct node_info ni; @@ -1033,13 +1035,14 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, int off; int phase = 0; int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + for (off = 0; off < usable_blks_in_seg; off++, entry++) { struct page *data_page; struct inode *inode; struct node_info dni; /* dnode info for the data */ @@ -1204,6 +1207,15 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, if (__is_large_section(sbi)) end_segno = rounddown(end_segno, sbi->segs_per_sec); + /* + * zone-capacity can be less than zone-size in zoned devices, + * resulting in less than expected usable segments in the zone, + * calculate the end segno in the zone which can be garbage collected + */ + if (f2fs_sb_has_blkzoned(sbi)) + end_segno -= sbi->segs_per_sec - + f2fs_usable_segs_in_sec(sbi, segno); + /* readahead multi ssa blocks those have contiguous address */ if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), @@ -1356,7 +1368,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, goto stop; seg_freed = 
do_garbage_collect(sbi, segno, &gc_list, gc_type); - if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) + if (gc_type == FG_GC && + seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) sec_freed++; total_freed += seg_freed; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index db3c61046aa4..ee5d7f30a1f8 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -44,13 +44,49 @@ struct gc_inode_list { /* * inline functions */ + +/* + * On a Zoned device zone-capacity can be less than zone-size and if + * zone-capacity is not aligned to f2fs segment size(2MB), then the segment + * starting just before zone-capacity has some blocks spanning across the + * zone-capacity, these blocks are not usable. + * Such spanning segments can be in free list so calculate the sum of usable + * blocks in currently free segments including normal and spanning segments. + */ +static inline block_t free_segs_blk_count_zoned(struct f2fs_sb_info *sbi) +{ + block_t free_seg_blks = 0; + struct free_segmap_info *free_i = FREE_I(sbi); + int j; + + spin_lock(&free_i->segmap_lock); + for (j = 0; j < MAIN_SEGS(sbi); j++) + if (!test_bit(j, free_i->free_segmap)) + free_seg_blks += f2fs_usable_blks_in_seg(sbi, j); + spin_unlock(&free_i->segmap_lock); + + return free_seg_blks; +} + +static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return free_segs_blk_count_zoned(sbi); + + return free_segments(sbi) << sbi->log_blocks_per_seg; +} + static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) { - if (free_segments(sbi) < overprovision_segments(sbi)) + block_t free_blks, ovp_blks; + + free_blks = free_segs_blk_count(sbi); + ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg; + + if (free_blks < ovp_blks) return 0; - else - return (free_segments(sbi) - overprovision_segments(sbi)) - << sbi->log_blocks_per_seg; + + return free_blks - ovp_blks; } static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 68c6df4dd2f8..1ffb2f2dd73c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -859,20 +859,22 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks, ckpt_valid_blocks; + unsigned int usable_blocks; if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) return; + usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || - ckpt_valid_blocks == sbi->blocks_per_seg)) { + ckpt_valid_blocks == usable_blocks)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); - } else if (valid_blocks < sbi->blocks_per_seg) { + } else if (valid_blocks < usable_blocks) { __locate_dirty_segment(sbi, segno, DIRTY); } else { /* Recovery routine with SSR needs this */ @@ -915,9 +917,11 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { se = get_seg_entry(sbi, segno); if (IS_NODESEG(se->type)) - holes[NODE] += sbi->blocks_per_seg - se->valid_blocks; + holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; else - holes[DATA] += sbi->blocks_per_seg - se->valid_blocks; + holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; } mutex_unlock(&dirty_i->seglist_lock); @@ 
-2166,7 +2170,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on(sbi, (new_vblocks < 0 || - (new_vblocks > sbi->blocks_per_seg))); + (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); se->valid_blocks = new_vblocks; se->mtime = get_mtime(sbi, false); @@ -2932,9 +2936,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (curseg->next_blkoff < sbi->blocks_per_seg) - return true; - return false; + + return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi, + curseg->segno); } int f2fs_rw_hint_to_seg_type(enum rw_hint hint) @@ -4293,9 +4297,12 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) { unsigned int start; int type; + struct seg_entry *sentry; for (start = 0; start < MAIN_SEGS(sbi); start++) { - struct seg_entry *sentry = get_seg_entry(sbi, start); + if (f2fs_usable_blks_in_seg(sbi, start) == 0) + continue; + sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); else @@ -4315,7 +4322,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; - block_t valid_blocks; + block_t valid_blocks, usable_blks_in_seg; block_t blks_per_sec = BLKS_PER_SEC(sbi); while (1) { @@ -4325,9 +4332,10 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, false); - if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) + usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); + if (valid_blocks == usable_blks_in_seg || !valid_blocks) continue; - if (valid_blocks > sbi->blocks_per_seg) { + if (valid_blocks > usable_blks_in_seg) { f2fs_bug_on(sbi, 1); continue; } @@ -4432,6 +4440,40 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) return 0; } +{ + return 0; +} + +static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} + +static inline unsigned int f2fs_usable_zone_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} + +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_blks_in_seg(sbi, segno); + + return sbi->blocks_per_seg; +} + +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_segs_in_sec(sbi, segno); + + return sbi->segs_per_sec; +} + /* * Update min, max modified time for cost-benefit GC algorithm */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 752b177073b2..4bf5bdb3ea07 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -411,6 +411,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); @@ -418,7 +419,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); - if (next >= start_segno + 
sbi->segs_per_sec) { + if (next >= start_segno + usable_segs) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } @@ -444,6 +445,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; + unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); if (test_and_clear_bit(segno, free_i->free_segmap)) { @@ -453,7 +455,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); - if (next >= start_segno + sbi->segs_per_sec) { + if (next >= start_segno + usable_segs) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; } @@ -546,8 +548,8 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) /* check current node segment */ for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { segno = CURSEG_I(sbi, i)->segno; - left_blocks = sbi->blocks_per_seg - - get_seg_entry(sbi, segno)->ckpt_valid_blocks; + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (node_blocks > left_blocks) return false; @@ -555,7 +557,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) /* check current data segment */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; - left_blocks = sbi->blocks_per_seg - + left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (dent_blocks > left_blocks) return false; @@ -677,21 +679,22 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; + unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno); /* check bitmap with valid block count */ do { if (is_valid) { next_pos = find_next_zero_bit_le(&raw_sit->valid_map, - sbi->blocks_per_seg, + usable_blks_per_seg, cur_pos); valid_blocks += next_pos - cur_pos; } else next_pos = find_next_bit_le(&raw_sit->valid_map, - sbi->blocks_per_seg, + usable_blks_per_seg, cur_pos); cur_pos = next_pos; is_valid = !is_valid; - } while (cur_pos < sbi->blocks_per_seg); + } while (cur_pos < usable_blks_per_seg); if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", @@ -700,8 +703,13 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } + if (usable_blks_per_seg < sbi->blocks_per_seg) + f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + usable_blks_per_seg) != sbi->blocks_per_seg); + /* check segment usage, and check boundary of a given segment number */ - if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg || segno > TOTAL_SEGS(sbi) - 1)) { f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); From e7f60aac9fe20c0d614d0af01cd2635be8bd30c4 Mon Sep 17 00:00:00 2001 From: Xiaojun Wang Date: Tue, 4 Aug 2020 21:46:04 +0800 Subject: [PATCH 057/580] f2fs: remove duplicated type casting Since DUMMY_WRITTEN_PAGE and ATOMIC_WRITTEN_PAGE have already been converted as unsigned long type, we don't need do type casting again. 
Signed-off-by: Xiaojun Wang Reported-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/segment.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f43ed6fb9c4a..1e5301c27dc1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -488,7 +488,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, zero_user_segment(page, 0, PAGE_SIZE); SetPagePrivate(page); - set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + set_page_private(page, DUMMY_WRITTEN_PAGE); lock_page(page); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) f2fs_bug_on(sbi, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4ebda759c738..84f83b059564 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1302,9 +1302,9 @@ enum fsync_mode { #define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) + (page_private(page) == ATOMIC_WRITTEN_PAGE) #define IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) + (page_private(page) == DUMMY_WRITTEN_PAGE) #ifdef CONFIG_F2FS_IO_TRACE #define IS_IO_TRACED_PAGE(page) \ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1ffb2f2dd73c..89c5062fa0a7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -189,7 +189,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); - f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); From 358632d9476baf23e1a7a3a8831f5789eddb0dc4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Aug 2020 09:23:10 +0800 Subject: [PATCH 058/580] f2fs: compress: remove unneeded code - f2fs_write_multi_pages - f2fs_compress_pages - init_compress_ctx - compress_pages - destroy_compress_ctx --- 1 - f2fs_write_compressed_pages - destroy_compress_ctx --- 2 destroy_compress_ctx() in f2fs_write_multi_pages() is redundant, remove it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index b9f204dd6548..ce44d685c663 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1359,9 +1359,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type) { - struct f2fs_inode_info *fi = F2FS_I(cc->inode); - const struct f2fs_compress_ops *cops = - f2fs_cops[fi->i_compress_algorithm]; int err; *submitted = 0; @@ -1376,7 +1373,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_compressed_pages(cc, submitted, wbc, io_type); - cops->destroy_compress_ctx(cc); kfree(cc->cpages); cc->cpages = NULL; if (!err) From 7b93c4c7287fd05bee62e85737f155af7a977846 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Aug 2020 21:14:45 +0800 Subject: [PATCH 059/580] f2fs: introduce inmem curseg The previous implementation of aligned pinfile allocation would: - allocate a new segment on the cold data log no matter whether the last used segment was partially used or not, which makes IOs more random; - force concurrent cold data/GCed IO into the warm data area, which can hurt hot/cold data separation; In this patch, we introduce a new type of log named 'inmem curseg'; the differences from a normal curseg are: - it reuses an existing segment type (CURSEG_XXX_NODE/DATA); - it only exists in memory, so its segno, blkofs and summary will not be persisted into the checkpoint area; With this new feature, we can enhance the scalability of logs, and special allocators can be created for specific purposes: - a pure LFS allocator for aligned pinfile allocation or file defragmentation - a pure SSR allocator for a later feature With that, let's update aligned pinfile allocation to use this new inmem curseg framework.
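To make the "not persisted" part concrete, the outline below paraphrases the checkpoint hunk added by this patch (see the fs/f2fs/checkpoint.c changes that follow); it is a simplified sketch, not a verbatim quote.

/* Simplified sketch of how the in-memory log is handled around a checkpoint. */
f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED);	/* flush its summary, or temporarily free the segment if it holds no valid blocks */
err = do_checkpoint(sbi, cpc);
/* ... error handling / prefree clearing as in the patch ... */
f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED);	/* re-reserve the segment if it was temporarily freed */

Since only NR_CURSEG_PERSIST_TYPE logs are written into the checkpoint pack, the pinned-data log never needs an on-disk summary slot.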
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++- fs/f2fs/debug.c | 6 ++- fs/f2fs/f2fs.h | 12 +++-- fs/f2fs/file.c | 5 +- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 109 +++++++++++++++++++++++++++++-------------- fs/f2fs/segment.h | 17 ++++--- fs/f2fs/super.c | 9 ++-- 8 files changed, 112 insertions(+), 55 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ff807e14c891..9e30ff6414b8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1619,11 +1619,16 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); + /* save inmem log status */ + f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + err = do_checkpoint(sbi, cpc); if (err) f2fs_release_discard_addrs(sbi); else f2fs_clear_prefree_segments(sbi, cpc); + + f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); stop: unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1654,7 +1659,7 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) } sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - - NR_CURSEG_TYPE - __cp_payload(sbi)) * + NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) * F2FS_ORPHANS_PER_BLOCK; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 4276c0f79beb..41a91aa8c262 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -164,7 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; si->util_invalid = 50 - si->util_free - si->util_valid; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); @@ -393,6 +393,10 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_seg[CURSEG_COLD_NODE], si->full_seg[CURSEG_COLD_NODE], si->valid_blks[CURSEG_COLD_NODE]); + seq_printf(s, " - Pinned file: %8d %8d %8d\n", + si->curseg[CURSEG_COLD_DATA_PINNED], + si->cursec[CURSEG_COLD_DATA_PINNED], + si->curzone[CURSEG_COLD_DATA_PINNED]); seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", si->main_area_segs - si->dirty_count - si->prefree_count - si->free_segs, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 84f83b059564..eb6067d43d5e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -972,7 +972,9 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, */ #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) -#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_INMEM_TYPE (1) +#define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) enum { CURSEG_HOT_DATA = 0, /* directory entry blocks */ @@ -981,8 +983,10 @@ enum { CURSEG_HOT_NODE, /* direct node blocks of directory files */ CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ - NO_CHECK_TYPE, - CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */ + NR_PERSISTENT_LOG, /* number of persistent log */ + CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG, + /* pinned file that needs consecutive block address */ + NO_CHECK_TYPE, /* number of persistent & inmem log */ }; struct flush_cmd { @@ -3338,6 +3342,8 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void 
f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type); +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84784aabc749..a3c54f2baef8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1659,13 +1659,14 @@ static int expand_inode_data(struct inode *inode, loff_t offset, } down_write(&sbi->pin_sem); - map.m_seg_type = CURSEG_COLD_DATA_PINNED; f2fs_lock_op(sbi); - f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA); + f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED); f2fs_unlock_op(sbi); + map.m_seg_type = CURSEG_COLD_DATA_PINNED; err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + up_write(&sbi->pin_sem); done += map.m_len; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2232bcf344b6..4ac8ce6ac2a3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1463,7 +1463,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, mutex_unlock(&DIRTY_I(sbi)->seglist_lock); /* Move out cursegs from the target range */ - for (type = CURSEG_HOT_DATA; type < NR_CURSEG_TYPE; type++) + for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) f2fs_allocate_segment_for_resize(sbi, type, start, end); /* do GC to move out valid blocks in the range */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 89c5062fa0a7..97e80678562f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1961,7 +1961,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) - __set_test_and_free(sbi, segno); + __set_test_and_free(sbi, segno, false); mutex_unlock(&dirty_i->seglist_lock); } @@ -2499,6 +2499,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; + curseg->inited = true; curseg->segno = curseg->next_segno; curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; @@ -2506,24 +2507,31 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); - if (IS_DATASEG(type)) + if (IS_DATASEG(curseg->seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); - if (IS_NODESEG(type)) + if (IS_NODESEG(curseg->seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); - __set_sit_entry_type(sbi, type, curseg->segno, modified); + __set_sit_entry_type(sbi, curseg->seg_type, curseg->segno, modified); } static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ if (__is_large_section(sbi)) - return CURSEG_I(sbi, type)->segno; + return curseg->segno; + + /* inmem log may not locate on any segment after mount */ + if (!curseg->inited) + return 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; if (test_opt(sbi, NOHEAP) && - (type == CURSEG_HOT_DATA || IS_NODESEG(type))) + (curseg->seg_type == CURSEG_HOT_DATA || + IS_NODESEG(curseg->seg_type))) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) @@ -2533,7 +2541,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; - return CURSEG_I(sbi, type)->segno; + return curseg->segno; } /* @@ -2543,12 +2551,14 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; unsigned int segno = curseg->segno; int dir = ALLOC_LEFT; - write_sum_page(sbi, curseg->sum_blk, + if (curseg->inited) + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno)); - if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) + if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA) dir = ALLOC_RIGHT; if (test_opt(sbi, NOHEAP)) @@ -2625,6 +2635,43 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) f2fs_put_page(sum_page, 1); } +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + + if (get_valid_blocks(sbi, curseg->segno, false)) { + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + } else { + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_free(sbi, curseg->segno, true); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); + } +out: + mutex_unlock(&curseg->curseg_mutex); +} + +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + if (get_valid_blocks(sbi, curseg->segno, false)) + goto out; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_inuse(sbi, curseg->segno); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); +out: + mutex_unlock(&curseg->curseg_mutex); +} + static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2741,11 +2788,15 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; + if (!curseg->inited) + goto alloc; + if (!curseg->next_blkoff && !get_valid_blocks(sbi, curseg->segno, false) && !get_ckpt_valid_blocks(sbi, curseg->segno)) return; +alloc: old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); locate_dirty_segment(sbi, old_segno); @@ -3129,19 +3180,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); - bool put_pin_sem = false; - - if (type == CURSEG_COLD_DATA) { - /* GC during CURSEG_COLD_DATA_PINNED allocation */ - if (down_read_trylock(&sbi->pin_sem)) { - put_pin_sem = true; - } else { - type = CURSEG_WARM_DATA; - curseg = CURSEG_I(sbi, type); - } - } else if (type == CURSEG_COLD_DATA_PINNED) { - type = CURSEG_COLD_DATA; - } down_read(&SM_I(sbi)->curseg_lock); @@ -3207,9 +3245,6 @@ 
void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); up_read(&SM_I(sbi)->curseg_lock); - - if (put_pin_sem) - up_read(&sbi->pin_sem); } static void update_device_state(struct f2fs_io_info *fio) @@ -3577,7 +3612,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); if (__exist_node_summaries(sbi)) - blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); + blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); } else { @@ -3655,8 +3690,9 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) } if (__exist_node_summaries(sbi)) - f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), - NR_CURSEG_TYPE - type, META_CP, true); + f2fs_ra_meta_pages(sbi, + sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), + NR_CURSEG_PERSIST_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); @@ -4158,14 +4194,14 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), - GFP_KERNEL); + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, + sizeof(*array)), GFP_KERNEL); if (!array) return -ENOMEM; SM_I(sbi)->curseg_array = array; - for (i = 0; i < NR_CURSEG_TYPE; i++) { + for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) @@ -4175,8 +4211,13 @@ static int build_curseg(struct f2fs_sb_info *sbi) sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; + if (i < NR_PERSISTENT_LOG) + array[i].seg_type = CURSEG_HOT_DATA + i; + else if (i == CURSEG_COLD_DATA_PINNED) + array[i].seg_type = CURSEG_COLD_DATA; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; + array[i].inited = false; } return restore_curseg_summaries(sbi); } @@ -4415,7 +4456,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; * In LFS curseg, all blkaddr after .next_blkoff should be unused. 
*/ - for (i = 0; i < NO_CHECK_TYPE; i++) { + for (i = 0; i < NR_PERSISTENT_LOG; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; @@ -4440,10 +4481,6 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) return 0; } -{ - return 0; -} - static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno) { diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 4bf5bdb3ea07..ddaffe30f992 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -22,7 +22,7 @@ #define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) #define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) #define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) @@ -34,7 +34,8 @@ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno)) #define IS_CURSEC(sbi, secno) \ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ @@ -48,7 +49,9 @@ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - (sbi)->segs_per_sec)) \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ + (sbi)->segs_per_sec)) #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ @@ -288,10 +291,12 @@ struct curseg_info { struct rw_semaphore journal_rwsem; /* protect journal area */ struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ + unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ + bool inited; /* indicate inmem log is inited */ }; struct sit_entry_set { @@ -305,8 +310,6 @@ struct sit_entry_set { */ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) { - if (type == CURSEG_COLD_DATA_PINNED) - type = CURSEG_COLD_DATA; return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } @@ -439,7 +442,7 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, } static inline void __set_test_and_free(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool inmem) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); @@ -451,7 +454,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; - if (IS_CURSEC(sbi, secno)) + if (!inmem && IS_CURSEC(sbi, secno)) goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0f0317a9c0b2..4b6847923cf1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -577,7 +577,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_active_logs: if (args->from && match_int(args, &arg)) 
return -EINVAL; - if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + if (arg != 2 && arg != 4 && + arg != NR_CURSEG_PERSIST_TYPE) return -EINVAL; F2FS_OPTION(sbi).active_logs = arg; break; @@ -988,7 +989,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_TYPE. + * than NR_CURSEG_PERSIST_TYPE. */ if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; @@ -1629,7 +1630,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE; + F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; @@ -2961,7 +2962,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) cp_payload = __cp_payload(sbi); if (cp_pack_start_sum < cp_payload + 1 || cp_pack_start_sum > blocks_per_seg - 1 - - NR_CURSEG_TYPE) { + NR_CURSEG_PERSIST_TYPE) { f2fs_err(sbi, "Wrong cp_pack_start_sum: %u", cp_pack_start_sum); return 1; From b1f5e8da6c040d417c8e82fc4ab631b4a30171ac Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Aug 2020 21:14:46 +0800 Subject: [PATCH 060/580] f2fs: record average update time of segment Previously, once we update one block in segment, we will update mtime of segment to last time, making aged segment becoming freshest, result in that GC with cost benefit algorithm missing such segment, So this patch changes to record mtime as average block updating time instead of last updating time. It's not needed to reset mtime for prefree segment, as se->valid_blocks is zero, then old se->mtime won't take any weight with below calculation: se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, se->valid_blocks + 1); Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 97e80678562f..33742a90e640 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2153,6 +2153,22 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, __mark_sit_entry_dirty(sbi, segno); } +static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + unsigned int segno = GET_SEGNO(sbi, blkaddr); + struct seg_entry *se = get_seg_entry(sbi, segno); + unsigned long long mtime = get_mtime(sbi, false); + + if (!se->mtime) + se->mtime = mtime; + else + se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, + se->valid_blocks + 1); + + if (mtime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = mtime; +} + static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; @@ -2172,10 +2188,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) f2fs_bug_on(sbi, (new_vblocks < 0 || (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); + update_segment_mtime(sbi, blkaddr); + se->valid_blocks = new_vblocks; - se->mtime = get_mtime(sbi, false); - if (se->mtime > SIT_I(sbi)->max_mtime) - SIT_I(sbi)->max_mtime = se->mtime; /* Update valid block bitmap */ if (del > 0) { From 0b550776e16ae65bdfb1406a63b64ff250976a83 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Aug 2020 21:14:47 +0800 Subject: [PATCH 061/580] f2fs: 
inherit mtime of original block during GC Don't let f2fs inner GC ruins original aging degree of segment. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 4 ++-- fs/f2fs/segment.c | 56 +++++++++++++++++++++++++++++++++++++---------- 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1e5301c27dc1..8c7e3fb4f756 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1367,7 +1367,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, - &sum, seg_type, NULL); + &sum, seg_type, NULL, false); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eb6067d43d5e..bc9a2c12753d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3362,7 +3362,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, int f2fs_inplace_write_data(struct f2fs_io_info *fio); void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg, bool recover_newaddr); + bool recover_curseg, bool recover_newaddr, + bool from_gc); void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, @@ -3370,7 +3371,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio); + struct f2fs_io_info *fio, bool from_gc); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4ac8ce6ac2a3..697908c333e2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -879,7 +879,7 @@ static int move_data_block(struct inode *inode, block_t bidx, } f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA, NULL); + &sum, CURSEG_COLD_DATA, NULL, true); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -929,7 +929,7 @@ static int move_data_block(struct inode *inode, block_t bidx, recover_block: if (err) f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, - true, true); + true, true, true); up_out: if (lfs_mode) up_write(&fio.sbi->io_order_lock); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 33742a90e640..7b350132cf5b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2153,11 +2153,28 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, __mark_sit_entry_dirty(sbi, segno); } -static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr) +static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, + block_t blkaddr) { unsigned int segno = GET_SEGNO(sbi, blkaddr); - struct seg_entry *se = get_seg_entry(sbi, segno); - unsigned long long mtime = get_mtime(sbi, false); + + if (segno == NULL_SEGNO) + return 0; + return get_seg_entry(sbi, segno)->mtime; +} + +static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, + unsigned long long 
old_mtime) +{ + struct seg_entry *se; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned long long ctime = get_mtime(sbi, false); + unsigned long long mtime = old_mtime ? old_mtime : ctime; + + if (segno == NULL_SEGNO) + return; + + se = get_seg_entry(sbi, segno); if (!se->mtime) se->mtime = mtime; @@ -2165,8 +2182,8 @@ static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr) se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, se->valid_blocks + 1); - if (mtime > SIT_I(sbi)->max_mtime) - SIT_I(sbi)->max_mtime = mtime; + if (ctime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = ctime; } static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) @@ -2188,8 +2205,6 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) f2fs_bug_on(sbi, (new_vblocks < 0 || (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); - update_segment_mtime(sbi, blkaddr); - se->valid_blocks = new_vblocks; /* Update valid block bitmap */ @@ -2283,6 +2298,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); + update_segment_mtime(sbi, addr, 0); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ @@ -3191,10 +3207,11 @@ static int __get_segment_type(struct f2fs_io_info *fio) void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio) + struct f2fs_io_info *fio, bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned long long old_mtime; down_read(&SM_I(sbi)->curseg_lock); @@ -3216,6 +3233,14 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (from_gc) { + old_mtime = get_segment_mtime(sbi, old_blkaddr); + } else { + update_segment_mtime(sbi, old_blkaddr, 0); + old_mtime = 0; + } + update_segment_mtime(sbi, *new_blkaddr, old_mtime); + /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. 
@@ -3292,7 +3317,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) down_read(&fio->sbi->io_order_lock); reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio); + &fio->new_blkaddr, sum, type, fio, + is_cold_data(fio->page)); if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(fio->sbi), fio->old_blkaddr, fio->old_blkaddr); @@ -3408,7 +3434,8 @@ static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg, bool recover_newaddr) + bool recover_curseg, bool recover_newaddr, + bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; @@ -3459,11 +3486,16 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - if (!recover_curseg || recover_newaddr) + if (!recover_curseg || recover_newaddr) { + if (!from_gc) + update_segment_mtime(sbi, new_blkaddr, 0); update_sit_entry(sbi, new_blkaddr, 1); + } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + if (!from_gc) + update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); } @@ -3495,7 +3527,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, set_summary(&sum, dn->nid, dn->ofs_in_node, version); f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, - recover_curseg, recover_newaddr); + recover_curseg, recover_newaddr, false); f2fs_update_data_blkaddr(dn, new_addr); } From bcca6eeed7e8aa88f99eace0707ee25af7c71c16 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Aug 2020 21:14:48 +0800 Subject: [PATCH 062/580] f2fs: support 64-bits key in f2fs rb-tree node entry Then we can add a given entry into the rb-tree with a 64-bit segment time as the key.
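A rough usage sketch of the extended lookup helper (illustrative only, not part of this patch's diff; the victim_entry "ve" and atgc_management "am" names come from the later ATGC patch, and locking is omitted):

	struct rb_node **p, *parent = NULL;
	bool leftmost = true;

	/* walk down comparing 64-bit keys; leftmost tracks the cached-first hint */
	p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, ve->mtime, &leftmost);
	rb_link_node(&ve->rb_node, parent, p);
	rb_insert_color_cached(&ve->rb_node, &am->root, leftmost);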
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 37 +++++++++++++++++++++++++++++++++++-- fs/f2fs/f2fs.h | 15 ++++++++++++--- fs/f2fs/segment.c | 4 ++-- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 686c68b98610..3ebf976a682d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -58,6 +58,29 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, return re; } +struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned long long key, bool *leftmost) +{ + struct rb_node **p = &root->rb_root.rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (key < re->key) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + *leftmost = false; + } + } + + return p; +} + struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -166,7 +189,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, } bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root) + struct rb_root_cached *root, bool check_key) { #ifdef CONFIG_F2FS_CHECK_FS struct rb_node *cur = rb_first_cached(root), *next; @@ -183,13 +206,23 @@ bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, cur_re = rb_entry(cur, struct rb_entry, rb_node); next_re = rb_entry(next, struct rb_entry, rb_node); + if (check_key) { + if (cur_re->key > next_re->key) { + f2fs_info(sbi, "inconsistent rbtree, " + "cur(%llu) next(%llu)", + cur_re->key, next_re->key); + return false; + } + goto next; + } + if (cur_re->ofs + cur_re->len > next_re->ofs) { f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)", cur_re->ofs, cur_re->len, next_re->ofs, next_re->len); return false; } - +next: cur = next; } #endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bc9a2c12753d..a03b908df229 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -611,8 +611,13 @@ enum { struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ - unsigned int ofs; /* start offset of the entry */ - unsigned int len; /* length of the entry */ + union { + struct { + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ + }; + unsigned long long key; /* 64-bits key */ + }; }; struct extent_info { @@ -3812,6 +3817,10 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); */ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned long long key, bool *left_most); struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root_cached *root, struct rb_node **parent, @@ -3822,7 +3831,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root); + struct rb_root_cached *root, bool check_key); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); diff --git a/fs/f2fs/segment.c 
b/fs/f2fs/segment.c index 7b350132cf5b..16f2376d9b00 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1525,7 +1525,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + &dcc->root, false)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -2891,7 +2891,7 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root)); + &dcc->root, false)); dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, From 36b898628e8ead12867f698db6349335f965a681 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 Aug 2020 18:38:45 +0800 Subject: [PATCH 063/580] f2fs: fix compile warning This patch fixes the compile warning below, reported by LKP (kernel test robot): cppcheck warnings: (new ones prefixed by >>) >> fs/f2fs/file.c:761:9: warning: Identical condition 'err', second condition is always false [identicalConditionAfterEarlyExit] return err; ^ fs/f2fs/file.c:753:6: note: first condition if (err) ^ fs/f2fs/file.c:761:9: note: second condition return err; Reported-by: kernel test robot Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a3c54f2baef8..36a681fe1d80 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -753,11 +753,14 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) return err; #ifdef CONFIG_F2FS_FS_COMPRESSION - if (from != free_from) + if (from != free_from) { err = f2fs_truncate_partial_cluster(inode, from, lock); + if (err) + return err; + } #endif - return err; + return 0; } int f2fs_truncate(struct inode *inode) From e4d2351af1ec4b27d14b26790dd347bd1c599514 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 Aug 2020 18:39:30 +0800 Subject: [PATCH 064/580] f2fs: compress: use more readable atomic_t type for {cic,dic}.ref A refcount_t variable should never be less than one, so it is a little hard to understand when we use one to indicate the pending compressed page count; let's change to atomic_t for better readability.
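The pattern this enables looks roughly like the following (a simplified sketch, not taken from the diff; ctx and finish_cluster() are placeholder names): the submitter seeds the in-flight count, and whichever completion drops it to zero finishes the cluster.

	atomic_set(&ctx->pending_pages, nr_pages);	/* at submission time */

	/* in each per-page end_io path */
	if (!atomic_dec_return(&ctx->pending_pages))
		finish_cluster(ctx);			/* last page completed */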
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 10 +++++----- fs/f2fs/data.c | 6 +++--- fs/f2fs/f2fs.h | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index ce44d685c663..68cfdff85fee 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -648,7 +648,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) if (bio->bi_status || PageError(page)) dic->failed = true; - if (refcount_dec_not_one(&dic->ref)) + if (atomic_dec_return(&dic->pending_pages)) return; trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, @@ -717,7 +717,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) cops->destroy_decompress_ctx(dic); out_free_dic: if (verity) - refcount_set(&dic->ref, dic->nr_cpages); + atomic_set(&dic->pending_pages, dic->nr_cpages); if (!verity) f2fs_decompress_end_io(dic->rpages, dic->cluster_size, ret, false); @@ -1132,7 +1132,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - refcount_set(&cic->ref, cc->nr_cpages); + atomic_set(&cic->pending_pages, cc->nr_cpages); cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << cc->log_cluster_size, GFP_NOFS); if (!cic->rpages) @@ -1267,7 +1267,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) dec_page_count(sbi, F2FS_WB_DATA); - if (refcount_dec_not_one(&cic->ref)) + if (atomic_dec_return(&cic->pending_pages)) return; for (i = 0; i < cic->nr_rpages; i++) { @@ -1409,7 +1409,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - refcount_set(&dic->ref, cc->nr_cpages); + atomic_set(&dic->pending_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8c7e3fb4f756..4b4d14eb820c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -200,7 +200,7 @@ static void f2fs_verify_bio(struct bio *bio) dic = (struct decompress_io_ctx *)page_private(page); if (dic) { - if (refcount_dec_not_one(&dic->ref)) + if (atomic_dec_return(&dic->pending_pages)) continue; f2fs_verify_pages(dic->rpages, dic->cluster_size); @@ -2221,8 +2221,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (IS_ERR(bio)) { ret = PTR_ERR(bio); dic->failed = true; - if (refcount_sub_and_test(dic->nr_cpages - i, - &dic->ref)) { + if (!atomic_sub_return(dic->nr_cpages - i, + &dic->pending_pages)) { f2fs_decompress_end_io(dic->rpages, cc->cluster_size, true, false); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a03b908df229..5a0fcbdcc6d9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1373,7 +1373,7 @@ struct compress_io_ctx { struct inode *inode; /* inode the context belong to */ struct page **rpages; /* pages store raw data in cluster */ unsigned int nr_rpages; /* total page number in rpages */ - refcount_t ref; /* referrence count of raw page */ + atomic_t pending_pages; /* in-flight compressed page count */ }; /* decompress io context for read IO path */ @@ -1392,7 +1392,7 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - refcount_t ref; /* referrence count of compressed page */ + atomic_t pending_pages; /* in-flight compressed page count */ bool 
failed; /* indicate IO error during decompression */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ From fe8bd7c5727e3714ca105c085ee603efc78560c2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Jul 2020 02:12:34 -0700 Subject: [PATCH 065/580] unicode: Add utf8_casefold_hash This adds a case insensitive hash function to allow taking the hash without needing to allocate a casefolded copy of the string. The existing d_hash implementations for casefolding allocate memory within rcu-walk, by avoiding it we can be more efficient and avoid worrying about a failed allocation. Signed-off-by: Daniel Rosenberg Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/unicode/utf8-core.c | 23 ++++++++++++++++++++++- include/linux/unicode.h | 3 +++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index 71ca4d047d65..313f9b81e852 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "utf8n.h" @@ -122,9 +123,29 @@ int utf8_casefold(const struct unicode_map *um, const struct qstr *str, } return -EINVAL; } - EXPORT_SYMBOL(utf8_casefold); +int utf8_casefold_hash(const struct unicode_map *um, const void *salt, + struct qstr *str) +{ + const struct utf8data *data = utf8nfdicf(um->version); + struct utf8cursor cur; + int c; + unsigned long hash = init_name_hash(salt); + + if (utf8ncursor(&cur, data, str->name, str->len) < 0) + return -EINVAL; + + while ((c = utf8byte(&cur))) { + if (c < 0) + return -EINVAL; + hash = partial_name_hash((unsigned char)c, hash); + } + str->hash = end_name_hash(hash); + return 0; +} +EXPORT_SYMBOL(utf8_casefold_hash); + int utf8_normalize(const struct unicode_map *um, const struct qstr *str, unsigned char *dest, size_t dlen) { diff --git a/include/linux/unicode.h b/include/linux/unicode.h index 990aa97d8049..74484d44c755 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -27,6 +27,9 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str, int utf8_casefold(const struct unicode_map *um, const struct qstr *str, unsigned char *dest, size_t dlen); +int utf8_casefold_hash(const struct unicode_map *um, const void *salt, + struct qstr *str); + struct unicode_map *utf8_load(const char *version); void utf8_unload(struct unicode_map *um); From ac770abf086def104842fbe552ff46155ff8d288 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Jul 2020 02:12:35 -0700 Subject: [PATCH 066/580] fs: Add standard casefolding support This adds general supporting functions for filesystems that use utf8 casefolding. It provides standard dentry_operations and adds the necessary structures in struct super_block to allow this standardization. The new dentry operations are functionally equivalent to the existing operations in ext4 and f2fs, apart from the use of utf8_casefold_hash to avoid an allocation. By providing a common implementation, all users can benefit from any optimizations without needing to port over improvements. 
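A minimal adoption sketch (illustrative only; the "12.1.0" version string and the example_* names are assumptions, and error handling for utf8_load() is omitted): a filesystem wires the helpers into its dentry_operations and publishes its unicode_map on the superblock at mount time.

	static const struct dentry_operations example_ci_dentry_ops = {
		.d_hash		= generic_ci_d_hash,
		.d_compare	= generic_ci_d_compare,
	};

	/* at mount time, once the casefold feature is detected */
	sb->s_encoding = utf8_load("12.1.0");
	sb->s_encoding_flags = 0;	/* or SB_ENC_STRICT_MODE_FL */
	sb->s_d_op = &example_ci_dentry_ops;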
Signed-off-by: Daniel Rosenberg Reviewed-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/libfs.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 16 +++++++++ 2 files changed, 103 insertions(+) diff --git a/fs/libfs.c b/fs/libfs.c index 0fb590d79f30..aca24b05ca9e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -16,6 +16,8 @@ #include #include #include /* sync_mapping_buffers */ +#include +#include #include @@ -1254,3 +1256,88 @@ bool is_empty_dir_inode(struct inode *inode) return (inode->i_fop == &empty_dir_operations) && (inode->i_op == &empty_dir_inode_operations); } + +#ifdef CONFIG_UNICODE +/* + * Determine if the name of a dentry should be casefolded. + * + * Return: if names will need casefolding + */ +static bool needs_casefold(const struct inode *dir) +{ + return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding; +} + +/** + * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems + * @dentry: dentry whose name we are checking against + * @len: len of name of dentry + * @str: str pointer to name of dentry + * @name: Name to compare against + * + * Return: 0 if names match, 1 if mismatch, or -ERRNO + */ +int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +{ + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *dir = READ_ONCE(parent->d_inode); + const struct super_block *sb = dentry->d_sb; + const struct unicode_map *um = sb->s_encoding; + struct qstr qstr = QSTR_INIT(str, len); + char strbuf[DNAME_INLINE_LEN]; + int ret; + + if (!dir || !needs_casefold(dir)) + goto fallback; + /* + * If the dentry name is stored in-line, then it may be concurrently + * modified by a rename. If this happens, the VFS will eventually retry + * the lookup, so it doesn't matter what ->d_compare() returns. + * However, it's unsafe to call utf8_strncasecmp() with an unstable + * string. Therefore, we have to copy the name into a temporary buffer. 
+ */ + if (len <= DNAME_INLINE_LEN - 1) { + memcpy(strbuf, str, len); + strbuf[len] = 0; + qstr.name = strbuf; + /* prevent compiler from optimizing out the temporary buffer */ + barrier(); + } + ret = utf8_strncasecmp(um, name, &qstr); + if (ret >= 0) + return ret; + + if (sb_has_strict_encoding(sb)) + return -EINVAL; +fallback: + if (len != name->len) + return 1; + return !!memcmp(str, name->name, len); +} +EXPORT_SYMBOL(generic_ci_d_compare); + +/** + * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems + * @dentry: dentry of the parent directory + * @str: qstr of name whose hash we should fill in + * + * Return: 0 if hash was successful or unchanged, and -EINVAL on error + */ +int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +{ + const struct inode *dir = READ_ONCE(dentry->d_inode); + struct super_block *sb = dentry->d_sb; + const struct unicode_map *um = sb->s_encoding; + int ret = 0; + + if (!dir || !needs_casefold(dir)) + return 0; + + ret = utf8_casefold_hash(um, dentry, str); + if (ret < 0 && sb_has_strict_encoding(sb)) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL(generic_ci_d_hash); +#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index be331da42955..2d3382a8729d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1324,6 +1324,12 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_ACTIVE (1<<30) #define SB_NOUSER (1<<31) +/* These flags relate to encoding and casefolding */ +#define SB_ENC_STRICT_MODE_FL (1 << 0) + +#define sb_has_strict_encoding(sb) \ + (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) + /* * Umount options */ @@ -1391,6 +1397,10 @@ struct super_block { #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; +#endif +#ifdef CONFIG_UNICODE + struct unicode_map *s_encoding; + __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ @@ -3225,6 +3235,12 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); +#ifdef CONFIG_UNICODE +extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); +extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name); +#endif + #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, struct page *, struct page *, From 1545d0af6e0aeeaec592ed6dabd9a41b3661acf7 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Jul 2020 02:12:36 -0700 Subject: [PATCH 067/580] f2fs: Use generic casefolding support This switches f2fs over to the generic support provided in the previous patch. Since casefolded dentries behave the same in ext4 and f2fs, we decrease the maintenance burden by unifying them, and any optimizations will immediately apply to both. 
Signed-off-by: Daniel Rosenberg Reviewed-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 84 +++++------------------------------------ fs/f2fs/f2fs.h | 4 -- fs/f2fs/super.c | 10 ++--- fs/f2fs/sysfs.c | 10 +++-- include/linux/f2fs_fs.h | 3 -- 5 files changed, 20 insertions(+), 91 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d559016668ab..f2f109493b4f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -75,21 +75,22 @@ int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { #ifdef CONFIG_UNICODE - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct super_block *sb = dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); if (IS_CASEFOLDED(dir)) { fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, GFP_NOFS); if (!fname->cf_name.name) return -ENOMEM; - fname->cf_name.len = utf8_casefold(sbi->s_encoding, + fname->cf_name.len = utf8_casefold(sb->s_encoding, fname->usr_fname, fname->cf_name.name, F2FS_NAME_LEN); if ((int)fname->cf_name.len <= 0) { kfree(fname->cf_name.name); fname->cf_name.name = NULL; - if (f2fs_has_strict_mode(sbi)) + if (sb_has_strict_encoding(sb)) return -EINVAL; /* fall back to treating name as opaque byte sequence */ } @@ -215,8 +216,8 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, const u8 *de_name, u32 de_name_len) { - const struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - const struct unicode_map *um = sbi->s_encoding; + const struct super_block *sb = dir->i_sb; + const struct unicode_map *um = sb->s_encoding; struct qstr entry = QSTR_INIT(de_name, de_name_len); int res; @@ -226,7 +227,7 @@ static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, * In strict mode, ignore invalid names. In non-strict mode, * fall back to treating them as opaque byte sequences. */ - if (f2fs_has_strict_mode(sbi) || name->len != entry.len) + if (sb_has_strict_encoding(sb) || name->len != entry.len) return false; return !memcmp(name->name, entry.name, name->len); } @@ -1107,75 +1108,8 @@ const struct file_operations f2fs_dir_operations = { }; #ifdef CONFIG_UNICODE -static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) -{ - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *dir = READ_ONCE(parent->d_inode); - const struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct qstr entry = QSTR_INIT(str, len); - char strbuf[DNAME_INLINE_LEN]; - int res; - - if (!dir || !IS_CASEFOLDED(dir)) - goto fallback; - - /* - * If the dentry name is stored in-line, then it may be concurrently - * modified by a rename. If this happens, the VFS will eventually retry - * the lookup, so it doesn't matter what ->d_compare() returns. - * However, it's unsafe to call utf8_strncasecmp() with an unstable - * string. Therefore, we have to copy the name into a temporary buffer. 
- */ - if (len <= DNAME_INLINE_LEN - 1) { - memcpy(strbuf, str, len); - strbuf[len] = 0; - entry.name = strbuf; - /* prevent compiler from optimizing out the temporary buffer */ - barrier(); - } - - res = utf8_strncasecmp(sbi->s_encoding, name, &entry); - if (res >= 0) - return res; - - if (f2fs_has_strict_mode(sbi)) - return -EINVAL; -fallback: - if (len != name->len) - return 1; - return !!memcmp(str, name->name, len); -} - -static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const struct unicode_map *um = sbi->s_encoding; - const struct inode *inode = READ_ONCE(dentry->d_inode); - unsigned char *norm; - int len, ret = 0; - - if (!inode || !IS_CASEFOLDED(inode)) - return 0; - - norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC); - if (!norm) - return -ENOMEM; - - len = utf8_casefold(um, str, norm, PATH_MAX); - if (len < 0) { - if (f2fs_has_strict_mode(sbi)) - ret = -EINVAL; - goto out; - } - str->hash = full_name_hash(dentry, norm, len); -out: - kvfree(norm); - return ret; -} - const struct dentry_operations f2fs_dentry_ops = { - .d_hash = f2fs_d_hash, - .d_compare = f2fs_d_compare, + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, }; #endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5a0fcbdcc6d9..f313f097cf46 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1411,10 +1411,6 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ -#ifdef CONFIG_UNICODE - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4b6847923cf1..b5826f674a9a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1276,7 +1276,7 @@ static void f2fs_put_super(struct super_block *sb) for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif kfree(sbi); } @@ -3317,7 +3317,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) { #ifdef CONFIG_UNICODE - if (f2fs_sb_has_casefold(sbi) && !sbi->s_encoding) { + if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { const struct f2fs_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags; @@ -3348,8 +3348,8 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) "%s-%s with flags 0x%hx", encoding_info->name, encoding_info->version?:"\b", encoding_flags); - sbi->s_encoding = encoding; - sbi->s_encoding_flags = encoding_flags; + sbi->sb->s_encoding = encoding; + sbi->sb->s_encoding_flags = encoding_flags; sbi->sb->s_d_op = &f2fs_dentry_ops; } #else @@ -3834,7 +3834,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) kvfree(sbi->write_io[i]); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif free_options: #ifdef CONFIG_QUOTA diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c9ca43c55a53..556a97be5b82 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -176,12 +176,14 @@ static ssize_t encoding_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { #ifdef CONFIG_UNICODE + struct super_block *sb = sbi->sb; + if (f2fs_sb_has_casefold(sbi)) return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", - sbi->s_encoding->charset, - (sbi->s_encoding->version >> 16) & 
0xff, - (sbi->s_encoding->version >> 8) & 0xff, - sbi->s_encoding->version & 0xff); + sb->s_encoding->charset, + (sb->s_encoding->version >> 16) & 0xff, + (sb->s_encoding->version >> 8) & 0xff, + sb->s_encoding->version & 0xff); #endif return sprintf(buf, "(none)"); } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3c383ddd92dd..a5dbb57a687f 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -38,9 +38,6 @@ #define F2FS_MAX_QUOTAS 3 #define F2FS_ENC_UTF8_12_1 1 -#define F2FS_ENC_STRICT_MODE_FL (1 << 0) -#define f2fs_has_strict_mode(sbi) \ - (sbi->s_encoding_flags & F2FS_ENC_STRICT_MODE_FL) #define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ #define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */ From 12b887c0575b69c84fd01a29b231c520f83dab57 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Aug 2020 21:14:49 +0800 Subject: [PATCH 068/580] f2fs: support age threshold based garbage collection There are several issues in the current background GC algorithm: - valid block count is one of the key factors in the cost-overhead calculation, so if a segment has few valid blocks, the CB algorithm will still choose it as a victim even when its age is young or it sits in a hot area, which is not appropriate. - GCed data/node blocks go to the existing logs no matter whether the data's update frequency is the same or not, so hot and cold data may get mixed again. - the GC allocator mainly uses LFS-type segments, so it consumes free segments more quickly. This patch introduces a new algorithm named age-threshold based garbage collection to solve the above issues; it has three main steps: 1. select a source victim: - set an age threshold, and select candidates based on that threshold: e.g. 0 means youngest and 100 means oldest, so if we set the age threshold to 80 we select dirty segments whose age is in the range [80, 100] as candidates; - set a candidate_ratio threshold, and select candidates based on that ratio, so that we can shrink the candidates to the oldest segments; - select the segment with the fewest valid blocks in order to migrate blocks with minimum cost; 2. select a target victim: - select candidates based on the age threshold; - set a candidate_radius threshold, and search for candidates whose age is around the source victim's; the search radius should be less than the radius threshold; - select the target segment with the most valid blocks in order to avoid migrating the current target segment. 3. merge valid blocks from the source victim into the target victim with the SSR allocator.
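To see how the source-victim scoring below (cost = UINT_MAX - (age + u)) trades age against utilization, here is a small worked example assuming the default age_weight of 60 and an accuracy factor that has saturated at 10000:

	old section:   90% of the observed mtime range old, 30% valid
	    age = 10000 * 0.90 * 60 = 540000
	    u   = 10000 * 0.70 * 40 = 280000    ->  age + u = 820000
	young section: 10% old, only 5% valid
	    age = 10000 * 0.10 * 60 =  60000
	    u   = 10000 * 0.95 * 40 = 380000    ->  age + u = 440000

The old section gets the larger score, hence the lower cost, and is picked even though the young section has far fewer valid blocks; under the plain greedy policy the nearly empty young section would have won.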
Test steps: - create 160 dirty segments: * half of them have 128 valid blocks per segment * left of them have 384 valid blocks per segment - run background GC Benefit: GC count and block movement count both decrease obviously: - Before: - Valid: 86 - Dirty: 1 - Prefree: 11 - Free: 6001 (6001) GC calls: 162 (BG: 220) - data segments : 160 (160) - node segments : 2 (2) Try to move 41454 blocks (BG: 41454) - data blocks : 40960 (40960) - node blocks : 494 (494) IPU: 0 blocks SSR: 0 blocks in 0 segments LFS: 41364 blocks in 81 segments - After: - Valid: 87 - Dirty: 0 - Prefree: 4 - Free: 6008 (6008) GC calls: 75 (BG: 76) - data segments : 74 (74) - node segments : 1 (1) Try to move 12813 blocks (BG: 12813) - data blocks : 12544 (12544) - node blocks : 269 (269) IPU: 0 blocks SSR: 12032 blocks in 77 segments LFS: 855 blocks in 2 segments Signed-off-by: Chao Yu [Jaegeuk Kim: fix a bug along with pinfile in-mem segment] Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 3 +- fs/f2fs/checkpoint.c | 4 +- fs/f2fs/data.c | 2 +- fs/f2fs/debug.c | 4 + fs/f2fs/f2fs.h | 29 +- fs/f2fs/gc.c | 380 +++++++++++++++++++++++- fs/f2fs/gc.h | 25 ++ fs/f2fs/segment.c | 177 ++++++++--- fs/f2fs/segment.h | 25 +- fs/f2fs/super.c | 26 +- fs/f2fs/sysfs.c | 11 +- include/trace/events/f2fs.h | 8 +- 12 files changed, 630 insertions(+), 64 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 7f730c4c8df2..834d0becae6d 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -22,7 +22,8 @@ Contact: "Namjae Jeon" Description: Controls the victim selection policy for garbage collection. Setting gc_idle = 0(default) will disable this option. Setting gc_idle = 1 will select the Cost Benefit approach & setting - gc_idle = 2 will select the greedy approach. + gc_idle = 2 will select the greedy approach & setting + gc_idle = 3 will select the age-threshold based approach. 
What: /sys/fs/f2fs//reclaim_segments Date: October 2013 diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9e30ff6414b8..6059ce3758d8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1620,7 +1620,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); /* save inmem log status */ - f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + f2fs_save_inmem_curseg(sbi); err = do_checkpoint(sbi, cpc); if (err) @@ -1628,7 +1628,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) else f2fs_clear_prefree_segments(sbi, cpc); - f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + f2fs_restore_inmem_curseg(sbi); stop: unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4b4d14eb820c..2a04068783c6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1367,7 +1367,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, - &sum, seg_type, NULL, false); + &sum, seg_type, NULL); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 41a91aa8c262..cb679561f44d 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -397,6 +397,10 @@ static int stat_show(struct seq_file *s, void *v) si->curseg[CURSEG_COLD_DATA_PINNED], si->cursec[CURSEG_COLD_DATA_PINNED], si->curzone[CURSEG_COLD_DATA_PINNED]); + seq_printf(s, " - ATGC data: %8d %8d %8d\n", + si->curseg[CURSEG_ALL_DATA_ATGC], + si->cursec[CURSEG_ALL_DATA_ATGC], + si->curzone[CURSEG_ALL_DATA_ATGC]); seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", si->main_area_segs - si->dirty_count - si->prefree_count - si->free_segs, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f313f097cf46..2dcfea53c68f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 +#define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -977,7 +978,7 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, */ #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) -#define NR_CURSEG_INMEM_TYPE (1) +#define NR_CURSEG_INMEM_TYPE (2) #define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) #define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) @@ -991,6 +992,7 @@ enum { NR_PERSISTENT_LOG, /* number of persistent log */ CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG, /* pinned file that needs consecutive block address */ + CURSEG_ALL_DATA_ATGC, /* SSR alloctor in hot/warm/cold data area */ NO_CHECK_TYPE, /* number of persistent & inmem log */ }; @@ -1236,6 +1238,18 @@ struct inode_management { unsigned long ino_num; /* number of entries */ }; +/* for GC_AT */ +struct atgc_management { + bool atgc_enabled; /* ATGC is enabled or not */ + struct rb_root_cached root; /* root of victim rb-tree */ + struct list_head victim_list; /* linked with all victim entries */ + unsigned int victim_count; /* victim count in rb-tree */ + unsigned int candidate_ratio; /* candidate ratio */ + unsigned 
int max_candidate_count; /* max candidate count */ + unsigned int age_weight; /* age weight, vblock_weight = 100 - age_weight */ + unsigned long long age_threshold; /* age threshold */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -1268,6 +1282,7 @@ enum { GC_NORMAL, GC_IDLE_CB, GC_IDLE_GREEDY, + GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, }; @@ -1518,6 +1533,7 @@ struct f2fs_sb_info { * race between GC and GC or CP */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ + struct atgc_management am; /* atgc management */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ @@ -3343,8 +3359,11 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); -void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type); -void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type); +void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type); @@ -3372,7 +3391,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio, bool from_gc); + struct f2fs_io_info *fio); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); @@ -3512,6 +3531,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); +int __init f2fs_create_garbage_collection_cache(void); +void f2fs_destroy_garbage_collection_cache(void); /* * recovery.c diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 697908c333e2..84b9dac942e3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -21,6 +21,8 @@ #include "gc.h" #include +static struct kmem_cache *victim_entry_slab; + static unsigned int count_bits(const unsigned long *addr, unsigned int offset, unsigned int len); @@ -169,7 +171,16 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) { - int gc_mode = (gc_type == BG_GC) ? 
GC_CB : GC_GREEDY; + int gc_mode; + + if (gc_type == BG_GC) { + if (sbi->am.atgc_enabled) + gc_mode = GC_AT; + else + gc_mode = GC_CB; + } else { + gc_mode = GC_GREEDY; + } switch (sbi->gc_mode) { case GC_IDLE_CB: @@ -179,7 +190,11 @@ static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) case GC_URGENT_HIGH: gc_mode = GC_GREEDY; break; + case GC_IDLE_AT: + gc_mode = GC_AT; + break; } + return gc_mode; } @@ -193,6 +208,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->dirty_bitmap = dirty_i->dirty_segmap[type]; p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; + } else if (p->alloc_mode == AT_SSR) { + p->gc_mode = GC_GREEDY; + p->dirty_bitmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; + p->ofs_unit = 1; } else { p->gc_mode = select_gc_type(sbi, gc_type); p->ofs_unit = sbi->segs_per_sec; @@ -212,6 +232,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, */ if (gc_type != FG_GC && (sbi->gc_mode != GC_URGENT_HIGH) && + (p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; @@ -229,10 +250,16 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) return sbi->blocks_per_seg; + else if (p->alloc_mode == AT_SSR) + return UINT_MAX; + + /* LFS */ if (p->gc_mode == GC_GREEDY) return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; + else if (p->gc_mode == GC_AT) + return UINT_MAX; else /* No other gc_mode */ return 0; } @@ -298,8 +325,11 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) return get_valid_blocks(sbi, segno, true); - else + else if (p->gc_mode == GC_CB) return get_cb_cost(sbi, segno); + + f2fs_bug_on(sbi, 1); + return 0; } static unsigned int count_bits(const unsigned long *addr, @@ -314,6 +344,273 @@ static unsigned int count_bits(const unsigned long *addr, return sum; } +static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno, + struct rb_node *parent, struct rb_node **p, + bool left_most) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve; + + ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS); + + ve->mtime = mtime; + ve->segno = segno; + + rb_link_node(&ve->rb_node, parent, p); + rb_insert_color_cached(&ve->rb_node, &am->root, left_most); + + list_add_tail(&ve->list, &am->victim_list); + + am->victim_count++; + + return ve; +} + +static void insert_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) +{ + struct atgc_management *am = &sbi->am; + struct rb_node **p; + struct rb_node *parent = NULL; + bool left_most = true; + + p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most); + attach_victim_entry(sbi, mtime, segno, parent, p, left_most); +} + +static void add_victim_entry(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); + unsigned long long mtime = 0; + unsigned int i; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p->gc_mode == GC_AT && + get_valid_blocks(sbi, segno, true) == 0) + return; + + if (p->alloc_mode == AT_SSR && + get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0) + return; + } + + for (i = 0; i 
< sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, start + i)->mtime; + mtime = div_u64(mtime, sbi->segs_per_sec); + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (mtime < sit_i->dirty_min_mtime) + sit_i->dirty_min_mtime = mtime; + if (mtime > sit_i->dirty_max_mtime) + sit_i->dirty_max_mtime = mtime; + + /* don't choose young section as candidate */ + if (sit_i->dirty_max_mtime - mtime < p->age_threshold) + return; + + insert_victim_entry(sbi, mtime, segno); +} + +static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct atgc_management *am = &sbi->am; + struct rb_node *parent = NULL; + bool left_most; + + f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most); + + return parent; +} + +static void atgc_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct rb_root_cached *root = &am->root; + struct rb_node *node; + struct rb_entry *re; + struct victim_entry *ve; + unsigned long long total_time; + unsigned long long age, u, accu; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int sec_blocks = BLKS_PER_SEC(sbi); + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int age_weight = am->age_weight; + unsigned int cost; + unsigned int iter = 0; + + if (max_mtime < min_mtime) + return; + + max_mtime += 1; + total_time = max_mtime - min_mtime; + + accu = div64_u64(ULLONG_MAX, total_time); + accu = min_t(unsigned long long, div_u64(accu, 100), + DEFAULT_ACCURACY_CLASS); + + node = rb_first_cached(root); +next: + re = rb_entry_safe(node, struct rb_entry, rb_node); + if (!re) + return; + + ve = (struct victim_entry *)re; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip; + + /* age = 10000 * x% * 60 */ + age = div64_u64(accu * (max_mtime - ve->mtime), total_time) * + age_weight; + + vblocks = get_valid_blocks(sbi, ve->segno, true); + f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks); + + /* u = 10000 * x% * 40 */ + u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) * + (100 - age_weight); + + f2fs_bug_on(sbi, age + u >= UINT_MAX); + + cost = UINT_MAX - (age + u); + iter++; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip: + if (iter < dirty_threshold) { + node = rb_next(node); + goto next; + } +} + +/* + * select candidates around source section in range of + * [target - dirty_threshold, target + dirty_threshold] + */ +static void atssr_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct rb_node *node; + struct rb_entry *re; + struct victim_entry *ve; + unsigned long long age; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int seg_blocks = sbi->blocks_per_seg; + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int cost; + unsigned int iter = 0; + int stage = 0; + + if (max_mtime < min_mtime) + 
return; + max_mtime += 1; +next_stage: + node = lookup_central_victim(sbi, p); +next_node: + re = rb_entry_safe(node, struct rb_entry, rb_node); + if (!re) { + if (stage == 0) + goto skip_stage; + return; + } + + ve = (struct victim_entry *)re; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip_node; + + age = max_mtime - ve->mtime; + + vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks; + f2fs_bug_on(sbi, !vblocks); + + /* rare case */ + if (vblocks == seg_blocks) + goto skip_node; + + iter++; + + age = max_mtime - abs(p->age - age); + cost = UINT_MAX - vblocks; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip_node: + if (iter < dirty_threshold) { + if (stage == 0) + node = rb_prev(node); + else if (stage == 1) + node = rb_next(node); + goto next_node; + } +skip_stage: + if (stage < 1) { + stage++; + iter = 0; + goto next_stage; + } +} +static void lookup_victim_by_age(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &sbi->am.root, true)); + + if (p->gc_mode == GC_AT) + atgc_lookup_victim(sbi, p); + else if (p->alloc_mode == AT_SSR) + atssr_lookup_victim(sbi, p); + else + f2fs_bug_on(sbi, 1); +} + +static void release_victim_entry(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve, *tmp; + + list_for_each_entry_safe(ve, tmp, &am->victim_list, list) { + list_del(&ve->list); + kmem_cache_free(victim_entry_slab, ve); + am->victim_count--; + } + + am->root = RB_ROOT_CACHED; + + f2fs_bug_on(sbi, am->victim_count); + f2fs_bug_on(sbi, !list_empty(&am->victim_list)); +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. @@ -323,25 +620,37 @@ static unsigned int count_bits(const unsigned long *addr, * which has minimum valid blocks and removes it from dirty seglist. 
*/ static int get_victim_by_default(struct f2fs_sb_info *sbi, - unsigned int *result, int gc_type, int type, char alloc_mode) + unsigned int *result, int gc_type, int type, + char alloc_mode, unsigned long long age) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment; - unsigned int nsearched = 0; + unsigned int nsearched; + bool is_atgc; int ret = 0; mutex_lock(&dirty_i->seglist_lock); last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec; p.alloc_mode = alloc_mode; - select_policy(sbi, gc_type, type, &p); + p.age = age; + p.age_threshold = sbi->am.age_threshold; +retry: + select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; + p.oldest_age = 0; p.min_cost = get_max_cost(sbi, &p); + is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR); + nsearched = 0; + + if (is_atgc) + SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX; + if (*result != NULL_SEGNO) { if (!get_valid_blocks(sbi, *result, false)) { ret = -ENODATA; @@ -422,11 +731,16 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, /* Don't touch checkpointed data */ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && get_ckpt_valid_blocks(sbi, segno) && - p.alloc_mode != SSR)) + p.alloc_mode == LFS)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (is_atgc) { + add_victim_entry(sbi, &p, segno); + goto next; + } + cost = get_gc_cost(sbi, segno, &p); if (p.min_cost > cost) { @@ -445,6 +759,19 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, break; } } + + /* get victim for GC_AT/AT_SSR */ + if (is_atgc) { + lookup_victim_by_age(sbi, &p); + release_victim_entry(sbi); + } + + if (is_atgc && p.min_segno == NULL_SEGNO && + sm->elapsed_time < p.age_threshold) { + p.age_threshold = 0; + goto retry; + } + if (p.min_segno != NULL_SEGNO) { got_it: *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; @@ -793,6 +1120,8 @@ static int move_data_block(struct inode *inode, block_t bidx, block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); + int type = fio.sbi->am.atgc_enabled ? 
+ CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); @@ -879,7 +1208,7 @@ static int move_data_block(struct inode *inode, block_t bidx, } f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA, NULL, true); + &sum, type, NULL); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -1185,7 +1514,7 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, down_write(&sit_i->sentry_lock); ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, - NO_CHECK_TYPE, LFS); + NO_CHECK_TYPE, LFS, 0); up_write(&sit_i->sentry_lock); return ret; } @@ -1216,6 +1545,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, end_segno -= sbi->segs_per_sec - f2fs_usable_segs_in_sec(sbi, segno); + sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); + /* readahead multi ssa blocks those have contiguous address */ if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), @@ -1426,6 +1757,37 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, return ret; } +int __init f2fs_create_garbage_collection_cache(void) +{ + victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", + sizeof(struct victim_entry)); + if (!victim_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_garbage_collection_cache(void) +{ + kmem_cache_destroy(victim_entry_slab); +} + +static void init_atgc_management(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + + if (test_opt(sbi, ATGC) && + SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD) + am->atgc_enabled = true; + + am->root = RB_ROOT_CACHED; + INIT_LIST_HEAD(&am->victim_list); + am->victim_count = 0; + + am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; + am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; + am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; +} + void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -1436,6 +1798,8 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi)) SIT_I(sbi)->last_victim[ALLOC_NEXT] = GET_SEGNO(sbi, FDEV(0).end_blk) + 1; + + init_atgc_management(sbi); } static int free_segment_range(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index ee5d7f30a1f8..0c8dae12dc51 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -14,6 +14,14 @@ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ + +/* choose candidates from sections which has age of more than 7 days */ +#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7) +#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */ +#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */ +#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */ +#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */ + #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ @@ -41,6 +49,23 @@ struct gc_inode_list { struct radix_tree_root iroot; }; +struct victim_info { + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* section No. 
*/ +}; + +struct victim_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* segment No. */ + }; + struct victim_info vi; /* victim info */ + }; + struct list_head list; +}; + /* * inline functions */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 16f2376d9b00..5a9928c1ffb5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2423,9 +2423,9 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +static int is_next_segment_free(struct f2fs_sb_info *sbi, + struct curseg_info *curseg, int type) { - struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); @@ -2529,6 +2529,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; + unsigned short seg_type = curseg->seg_type; curseg->inited = true; curseg->segno = curseg->next_segno; @@ -2538,16 +2539,22 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); - if (IS_DATASEG(curseg->seg_type)) + + sanity_check_seg_type(sbi, seg_type); + + if (IS_DATASEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); - if (IS_NODESEG(curseg->seg_type)) + if (IS_NODESEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); - __set_sit_entry_type(sbi, curseg->seg_type, curseg->segno, modified); + __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); } static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; + + sanity_check_seg_type(sbi, seg_type); /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ if (__is_large_section(sbi)) @@ -2561,8 +2568,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) return 0; if (test_opt(sbi, NOHEAP) && - (curseg->seg_type == CURSEG_HOT_DATA || - IS_NODESEG(curseg->seg_type))) + (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type))) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) @@ -2638,7 +2644,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type) +static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2646,8 +2652,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) struct f2fs_summary_block *sum_node; struct page *sum_page; - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + if (flush) + write_sum_page(sbi, curseg->sum_blk, + GET_SUM_BLOCK(sbi, curseg->segno)); + __set_test_and_inuse(sbi, new_segno); mutex_lock(&dirty_i->seglist_lock); @@ -2666,7 +2674,56 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) f2fs_put_page(sum_page, 1); } -void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age); + +static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, + int target_type, int alloc_mode, + unsigned long long age) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + curseg->seg_type = target_type; + + if (get_ssr_segment(sbi, type, alloc_mode, age)) { + struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); + + curseg->seg_type = se->type; + change_curseg(sbi, type, true); + } else { + /* allocate cold segment by default */ + curseg->seg_type = CURSEG_COLD_DATA; + new_curseg(sbi, type, true); + } + stat_inc_seg_type(sbi, curseg); +} + +static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); + + if (!sbi->am.atgc_enabled) + return; + + down_read(&SM_I(sbi)->curseg_lock); + + mutex_lock(&curseg->curseg_mutex); + down_write(&SIT_I(sbi)->sentry_lock); + + get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); + + up_write(&SIT_I(sbi)->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + + up_read(&SM_I(sbi)->curseg_lock); + +} +void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_init_atgc_curseg(sbi); +} + +static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2686,7 +2743,15 @@ void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) mutex_unlock(&curseg->curseg_mutex); } -void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2703,23 +2768,35 @@ void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) mutex_unlock(&curseg->curseg_mutex); } -static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +void f2fs_restore_inmem_curseg(struct 
f2fs_sb_info *sbi) +{ + __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; unsigned segno = NULL_SEGNO; + unsigned short seg_type = curseg->seg_type; int i, cnt; bool reversed = false; + sanity_check_seg_type(sbi, seg_type); + /* f2fs_need_SSR() already forces to do this */ - if (!v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + if (!v_ops->get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { curseg->next_segno = segno; return 1; } /* For node segments, let's do SSR more intensively */ - if (IS_NODESEG(type)) { - if (type >= CURSEG_WARM_NODE) { + if (IS_NODESEG(seg_type)) { + if (seg_type >= CURSEG_WARM_NODE) { reversed = true; i = CURSEG_COLD_NODE; } else { @@ -2727,7 +2804,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) } cnt = NR_CURSEG_NODE_TYPE; } else { - if (type >= CURSEG_WARM_DATA) { + if (seg_type >= CURSEG_WARM_DATA) { reversed = true; i = CURSEG_COLD_DATA; } else { @@ -2737,9 +2814,9 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) } for (; cnt-- > 0; reversed ? i-- : i++) { - if (i == type) + if (i == seg_type) continue; - if (!v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + if (!v_ops->get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { curseg->next_segno = segno; return 1; } @@ -2768,13 +2845,15 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - type == CURSEG_WARM_NODE) + curseg->seg_type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type) && + else if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type); + else if (f2fs_need_SSR(sbi) && + get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); else new_curseg(sbi, type, false); @@ -2795,8 +2874,8 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, if (segno < start || segno > end) goto unlock; - if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type); + if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) + change_curseg(sbi, type, true); else new_curseg(sbi, type, true); @@ -3015,10 +3094,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) return err; } -static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) +static bool __has_curseg_space(struct f2fs_sb_info *sbi, + struct curseg_info *curseg) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi, curseg->segno); } @@ -3160,8 +3238,13 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page) || file_is_cold(inode) || - f2fs_compressed_file(inode)) + if (is_cold_data(fio->page)) { + if (fio->sbi->am.atgc_enabled) + return CURSEG_ALL_DATA_ATGC; + else + return CURSEG_COLD_DATA; + } + if (file_is_cold(inode) || f2fs_compressed_file(inode)) return CURSEG_COLD_DATA; if 
(file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || @@ -3207,19 +3290,29 @@ static int __get_segment_type(struct f2fs_io_info *fio) void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, - struct f2fs_io_info *fio, bool from_gc) + struct f2fs_io_info *fio) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned long long old_mtime; + bool from_gc = (type == CURSEG_ALL_DATA_ATGC); + struct seg_entry *se = NULL; down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); + if (from_gc) { + f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); + se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); + sanity_check_seg_type(sbi, se->type); + f2fs_bug_on(sbi, IS_NODESEG(se->type)); + } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); /* @@ -3249,9 +3342,13 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - + if (!__has_curseg_space(sbi, curseg)) { + if (from_gc) + get_atssr_segment(sbi, type, se->type, + AT_SSR, se->mtime); + else + sit_i->s_ops->allocate_segment(sbi, type, false); + } /* * segment dirty status should be updated after segment allocation, * so we just need to update status only one time after previous @@ -3317,8 +3414,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) down_read(&fio->sbi->io_order_lock); reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio, - is_cold_data(fio->page)); + &fio->new_blkaddr, sum, type, fio); if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) invalidate_mapping_pages(META_MAPPING(fio->sbi), fio->old_blkaddr, fio->old_blkaddr); @@ -3480,7 +3576,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type); + change_curseg(sbi, type, true); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -3507,7 +3603,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type); + change_curseg(sbi, type, true); } curseg->next_blkoff = old_blkoff; } @@ -4262,6 +4358,8 @@ static int build_curseg(struct f2fs_sb_info *sbi) array[i].seg_type = CURSEG_HOT_DATA + i; else if (i == CURSEG_COLD_DATA_PINNED) array[i].seg_type = CURSEG_COLD_DATA; + else if (i == CURSEG_ALL_DATA_ATGC) + array[i].seg_type = CURSEG_COLD_DATA; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; array[i].inited = false; @@ -4508,6 +4606,8 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; + sanity_check_seg_type(sbi, curseg->seg_type); + if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; @@ -4583,6 +4683,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi, false); + sit_i->dirty_max_mtime = 0; 
up_write(&sit_i->sentry_lock); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ddaffe30f992..47b888ad913b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -24,6 +24,12 @@ #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) +static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, + unsigned short seg_type) +{ + f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); +} + #define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) #define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) #define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) @@ -35,7 +41,8 @@ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ - ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno)) + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) #define IS_CURSEC(sbi, secno) \ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ @@ -51,6 +58,8 @@ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ (sbi)->segs_per_sec)) #define MAIN_BLKADDR(sbi) \ @@ -135,20 +144,25 @@ enum { * In the victim_sel_policy->alloc_mode, there are two block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into + * fragmented segment which has similar aging degree. */ enum { LFS = 0, - SSR + SSR, + AT_SSR, }; /* * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. + * GC_AT is based on age-threshold algorithm. */ enum { GC_CB = 0, GC_GREEDY, + GC_AT, ALLOC_NEXT, FLUSH_DEVICE, MAX_GC_POLICY, @@ -177,7 +191,10 @@ struct victim_sel_policy { unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ + unsigned long long oldest_age; /* oldest age of segments having the same min cost */ unsigned int min_segno; /* segment # having min. cost */ + unsigned long long age; /* mtime of GCed section*/ + unsigned long long age_threshold;/* age threshold */ }; struct seg_entry { @@ -243,6 +260,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. 
modification time */ + unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */ + unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; @@ -281,7 +300,7 @@ struct dirty_seglist_info { /* victim selection function for cleaning and SSR */ struct victim_selection { int (*get_victim)(struct f2fs_sb_info *, unsigned int *, - int, int, char); + int, int, char, unsigned long long); }; /* for active log information */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b5826f674a9a..23259de94a67 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -144,6 +144,7 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_atgc, Opt_err, }; @@ -210,6 +211,7 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_atgc, "atgc"}, {Opt_err, NULL}, }; @@ -926,6 +928,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "compression options not supported"); break; #endif + case Opt_atgc: + set_opt(sbi, ATGC); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1624,6 +1629,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_F2FS_FS_COMPRESSION f2fs_show_compress_options(seq, sbi->sb); #endif + + if (test_opt(sbi, ATGC)) + seq_puts(seq, ",atgc"); return 0; } @@ -1751,6 +1759,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); + bool no_atgc = !test_opt(sbi, ATGC); bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; @@ -1823,6 +1832,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } #endif + /* disallow enable atgc dynamically */ + if (no_atgc == !!test_opt(sbi, ATGC)) { + err = -EINVAL; + f2fs_warn(sbi, "switch atgc option is not allowed"); + goto restore_opts; + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -3739,6 +3755,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } } reset_checkpoint: + f2fs_init_inmem_curseg(sbi); + /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3944,9 +3962,12 @@ static int __init init_f2fs_fs(void) err = f2fs_create_extent_cache(); if (err) goto free_checkpoint_caches; - err = f2fs_init_sysfs(); + err = f2fs_create_garbage_collection_cache(); if (err) goto free_extent_cache; + err = f2fs_init_sysfs(); + if (err) + goto free_garbage_collection_cache; err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_sysfs; @@ -3980,6 +4001,8 @@ static int __init init_f2fs_fs(void) unregister_shrinker(&f2fs_shrinker_info); free_sysfs: f2fs_exit_sysfs(); +free_garbage_collection_cache: + f2fs_destroy_garbage_collection_cache(); free_extent_cache: f2fs_destroy_extent_cache(); free_checkpoint_caches: @@ -4004,6 +4027,7 @@ static void __exit exit_f2fs_fs(void) unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); f2fs_exit_sysfs(); + f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); f2fs_destroy_checkpoint_caches(); f2fs_destroy_segment_manager_caches(); diff --git 
a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 556a97be5b82..93b2a91a7b37 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -377,12 +377,17 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } if (!strcmp(a->attr.name, "gc_idle")) { - if (t == GC_IDLE_CB) + if (t == GC_IDLE_CB) { sbi->gc_mode = GC_IDLE_CB; - else if (t == GC_IDLE_GREEDY) + } else if (t == GC_IDLE_GREEDY) { sbi->gc_mode = GC_IDLE_GREEDY; - else + } else if (t == GC_IDLE_AT) { + if (!sbi->am.atgc_enabled) + return -EINVAL; + sbi->gc_mode = GC_AT; + } else { sbi->gc_mode = GC_NORMAL; + } return count; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 9970c25942b5..ca01996fca66 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -121,13 +121,15 @@ TRACE_DEFINE_ENUM(CP_RESIZE); #define show_alloc_mode(type) \ __print_symbolic(type, \ - { LFS, "LFS-mode" }, \ - { SSR, "SSR-mode" }) + { LFS, "LFS-mode" }, \ + { SSR, "SSR-mode" }, \ + { AT_SSR, "AT_SSR-mode" }) #define show_victim_policy(type) \ __print_symbolic(type, \ { GC_GREEDY, "Greedy" }, \ - { GC_CB, "Cost-Benefit" }) + { GC_CB, "Cost-Benefit" }, \ + { GC_AT, "Age-threshold" }) #define show_cpreason(type) \ __print_flags(type, "|", \ From 970c21a8a1aa002cde93cad8a95dcf3803071b69 Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Mon, 31 Aug 2020 09:58:02 +0800 Subject: [PATCH 069/580] f2fs: correct statistic of APP_DIRECT_IO/APP_DIRECT_READ_IO Miss to update APP_DIRECT_IO/APP_DIRECT_READ_IO when receiving async DIO. For example: fio -filename=/data/test.0 -bs=1m -ioengine=libaio -direct=1 -name=fill -size=10m -numjobs=1 -iodepth=32 -rw=write Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2a04068783c6..742b008a9b01 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3627,12 +3627,18 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) err); if (!do_opu) set_inode_flag(inode, FI_UPDATE_WRITE); + } else if (err == -EIOCBQUEUED) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + count - iov_iter_count(iter)); } else if (err < 0) { f2fs_write_failed(mapping, offset + count); } } else { if (err > 0) f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err); + else if (err == -EIOCBQUEUED) + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_READ_IO, + count - iov_iter_count(iter)); } out: From eec17a730e63ed1964efac1a53fd2c087870c202 Mon Sep 17 00:00:00 2001 From: Dan Robertson Date: Sun, 30 Aug 2020 21:45:23 +0000 Subject: [PATCH 070/580] f2fs: check position in move range ioctl When the move range ioctl is used, check the input and output position and ensure that it is a non-negative value. Without this check f2fs_get_dnode_of_data may hit a memmory bug. 
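(Illustrative sketch, not part of the patch: the positions arrive as signed loff_t values from user space, and a negative one becomes an enormous unsigned index once it is shifted and used as a block offset, which is presumably how f2fs_get_dnode_of_data ends up touching memory it should not. Hence the guard, placed before any block lookup.)

  loff_t pos_in = -4096, pos_out = 0;                /* user-controlled values */
  pgoff_t index = (pgoff_t)(pos_in >> PAGE_SHIFT);   /* wraps to a huge unsigned index */

  /* the fix, in sketch form: reject negative positions up front */
  if (pos_in < 0 || pos_out < 0)
          return -EINVAL;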
Signed-off-by: Dan Robertson Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 36a681fe1d80..4aca4f2a4110 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2790,6 +2790,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst)) return -EOPNOTSUPP; + if (pos_out < 0 || pos_in < 0) + return -EINVAL; + if (src == dst) { if (pos_in == pos_out) return 0; From e38657f394c616b1cf2408e72d48436a40e98149 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 31 Aug 2020 09:24:01 +0900 Subject: [PATCH 071/580] f2fs: add block address limit check to compressed file Need to add block address range check to compressed file case and avoid calling get_data_block_bmap() for compressed file. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 742b008a9b01..8813b2b8a1bb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1754,10 +1754,6 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) - return -EFBIG; - return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_BMAP, NULL, NO_CHECK_TYPE, create); @@ -3767,11 +3763,16 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) filemap_write_and_wait(mapping); - if (f2fs_compressed_file(inode)) - blknr = f2fs_bmap_compress(inode, block); + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks)) + goto out; - if (!get_data_block_bmap(inode, block, &tmp, 0)) - blknr = tmp.b_blocknr; + if (f2fs_compressed_file(inode)) { + blknr = f2fs_bmap_compress(inode, block); + } else { + if (!get_data_block_bmap(inode, block, &tmp, 0)) + blknr = tmp.b_blocknr; + } out: trace_f2fs_bmap(inode, block, blknr); return blknr; From 70c00aac6f97f1fe0639baf6ff59a025c99c7717 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 31 Aug 2020 11:09:49 +0900 Subject: [PATCH 072/580] f2fs: change compr_blocks of superblock info to 64bit Current compr_blocks of superblock info is not 64bit value. We are accumulating each i_compr_blocks count of inodes to this value and those are 64bit values. So, need to change this to 64bit value. 
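(A minimal sketch of the overflow being avoided, with hypothetical names rather than code from the patch: summing 64-bit per-inode counts into a 32-bit atomic_t truncates the argument and lets the filesystem-wide total wrap once it passes roughly four billion blocks; an atomic64_t keeps the full count.)

  #include <linux/atomic.h>

  static atomic64_t total_compr_blocks;              /* was a 32-bit atomic_t */

  static void account_compr_blocks(u64 per_inode_blocks)
  {
          /* atomic_add() on an atomic_t would truncate per_inode_blocks to
           * 32 bits and wrap the sum; atomic64_add() accumulates it fully */
          atomic64_add(per_inode_blocks, &total_compr_blocks);
  }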
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 6 +++--- fs/f2fs/f2fs.h | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index cb679561f44d..f3cc34db86d7 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -131,7 +131,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); - si->compr_blocks = atomic_read(&sbi->compr_blocks); + si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; @@ -342,7 +342,7 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n", + seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); @@ -550,7 +550,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); - atomic_set(&sbi->compr_blocks, 0); + atomic64_set(&sbi->compr_blocks, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2dcfea53c68f..12ce101d6bf7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1570,7 +1570,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ - atomic_t compr_blocks; /* # of compressed blocks */ + atomic64_t compr_blocks; /* # of compressed blocks */ atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ atomic_t max_vw_cnt; /* max # of volatile writes */ @@ -3568,7 +3568,8 @@ struct f2fs_stat_info { int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode, compr_blocks; + int compr_inode; + unsigned long long compr_blocks; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; @@ -3653,9 +3654,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ } while (0) #define stat_add_compr_blocks(inode, blocks) \ - (atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) + (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ - (atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) + (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ From 471f5d75533e973038e5ccee9c4b625050ac5339 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Sep 2020 15:01:52 +0800 Subject: [PATCH 073/580] f2fs: allocate proper size memory for zstd decompress As 5kft <5kft@5kft.org> reported: kworker/u9:3: page allocation failure: order:9, mode:0x40c40(GFP_NOFS|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 3 PID: 8168 Comm: kworker/u9:3 Tainted: G C 5.8.3-sunxi #trunk Hardware name: Allwinner 
sun8i Family Workqueue: f2fs_post_read_wq f2fs_post_read_work [] (unwind_backtrace) from [] (show_stack+0x11/0x14) [] (show_stack) from [] (dump_stack+0x75/0x84) [] (dump_stack) from [] (warn_alloc+0xa3/0x104) [] (warn_alloc) from [] (__alloc_pages_nodemask+0xb87/0xc40) [] (__alloc_pages_nodemask) from [] (kmalloc_order+0x19/0x38) [] (kmalloc_order) from [] (kmalloc_order_trace+0x19/0x90) [] (kmalloc_order_trace) from [] (zstd_init_decompress_ctx+0x21/0x88) [] (zstd_init_decompress_ctx) from [] (f2fs_decompress_pages+0x97/0x228) [] (f2fs_decompress_pages) from [] (__read_end_io+0xfb/0x130) [] (__read_end_io) from [] (f2fs_post_read_work+0x61/0x84) [] (f2fs_post_read_work) from [] (process_one_work+0x15f/0x3b0) [] (process_one_work) from [] (worker_thread+0xfb/0x3e0) [] (worker_thread) from [] (kthread+0xeb/0x10c) [] (kthread) from [] zstd may allocate large size memory for {,de}compression, it may cause file copy failure on low-end device which has very few memory. For decompression, let's just allocate proper size memory based on current file's cluster size instead of max cluster size. Reported-by: 5kft <5kft@5kft.org> Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 7 ++++--- fs/f2fs/f2fs.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 68cfdff85fee..eaee9c99ceb2 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -383,16 +383,17 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) ZSTD_DStream *stream; void *workspace; unsigned int workspace_size; + unsigned int max_window_size = + MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size); - workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE); + workspace_size = ZSTD_DStreamWorkspaceBound(max_window_size); workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), workspace_size, GFP_NOFS); if (!workspace) return -ENOMEM; - stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE, - workspace, workspace_size); + stream = ZSTD_initDStream(max_window_size, workspace, workspace_size); if (!stream) { printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 12ce101d6bf7..843537f7cc12 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1416,7 +1416,7 @@ struct decompress_io_ctx { #define NULL_CLUSTER ((unsigned int)(~0)) #define MIN_COMPRESS_LOG_SIZE 2 #define MAX_COMPRESS_LOG_SIZE 8 -#define MAX_COMPRESS_WINDOW_SIZE ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE) +#define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size)) struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ From e9ed8ba8ad056f93dac03fa26f06b518d49e3f38 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 3 Sep 2020 10:14:41 +0800 Subject: [PATCH 074/580] f2fs: ignore compress mount option on image w/o compression feature to keep consistent with behavior when passing compress mount option to kernel w/o compression feature, so that mount may not fail on such condition. 
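(For context, an illustrative condensation of the resulting parse_options() behaviour, not a literal quote of the source: when the kernel is built without CONFIG_F2FS_FS_COMPRESSION the compress options are already reported and skipped rather than failing the mount, and this change makes an image lacking the compression feature take the same informational path.)

  case Opt_compress_algorithm:
          if (!f2fs_sb_has_compression(sbi)) {
                  /* note and ignore instead of returning -EINVAL */
                  f2fs_info(sbi, "Image doesn't support compression");
                  break;
          }
          /* ... otherwise parse the requested algorithm as before ... */
          break;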
Reported-by: Kyungmin Park Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 23259de94a67..5ab3c8ab9885 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -861,8 +861,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature if off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } name = match_strdup(&args[0]); if (!name) @@ -884,8 +884,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; case Opt_compress_log_size: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature is off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } if (args->from && match_int(args, &arg)) return -EINVAL; @@ -899,8 +899,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; case Opt_compress_extension: if (!f2fs_sb_has_compression(sbi)) { - f2fs_err(sbi, "Compression feature is off"); - return -EINVAL; + f2fs_info(sbi, "Image doesn't support compression"); + break; } name = match_strdup(&args[0]); if (!name) From bf69a4dc6573aa4a74fe7f158745acacb818ddd3 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 8 Sep 2020 11:44:10 +0900 Subject: [PATCH 075/580] f2fs: change i_compr_blocks of inode to atomic value writepages() can be concurrently invoked for the same file by different threads such as a thread fsyncing the file and a kworker kernel thread. So, changing i_compr_blocks without protection is racy and we need to protect it by changing it with atomic type value. Plus, we don't need a 64bit value for i_compr_blocks, so just we will use a atomic value, not atomic64. 
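(A minimal sketch of the race being closed, illustrative only: with a plain field, two writers performing the read-modify-write concurrently can lose one update, because the load, add and store are separate steps; an atomic counter makes the update a single indivisible operation.)

  /* before (racy): an fsync thread and a writeback kworker both run */
  fi->i_compr_blocks += diff;        /* thread A: loads 100, stores 100 + diff */
  fi->i_compr_blocks += diff;        /* thread B: also loaded 100, one add lost */

  /* after: a lost update is no longer possible */
  atomic_add(diff, &fi->i_compr_blocks);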
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 17 ++++++++++------- fs/f2fs/file.c | 22 ++++++++++++---------- fs/f2fs/inode.c | 11 +++++++---- fs/f2fs/super.c | 1 + 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 843537f7cc12..63c38ffd4f51 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -806,7 +806,7 @@ struct f2fs_inode_info { struct timespec64 i_disk_time[4];/* inode disk times */ /* for file compress */ - u64 i_compr_blocks; /* # of compressed blocks */ + atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ unsigned int i_cluster_size; /* cluster size */ @@ -3969,17 +3969,19 @@ static inline void set_compress_context(struct inode *inode) f2fs_mark_inode_dirty_sync(inode, true); } -static inline u64 f2fs_disable_compressed_file(struct inode *inode) +static inline u32 f2fs_disable_compressed_file(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); + u32 i_compr_blocks; if (!f2fs_compressed_file(inode)) return 0; if (S_ISREG(inode->i_mode)) { if (get_dirty_pages(inode)) return 1; - if (fi->i_compr_blocks) - return fi->i_compr_blocks; + i_compr_blocks = atomic_read(&fi->i_compr_blocks); + if (i_compr_blocks) + return i_compr_blocks; } fi->i_flags &= ~F2FS_COMPR_FL; @@ -4096,16 +4098,17 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { int diff = F2FS_I(inode)->i_cluster_size - blocks; + struct f2fs_inode_info *fi = F2FS_I(inode); /* don't update i_compr_blocks if saved blocks were released */ - if (!add && !F2FS_I(inode)->i_compr_blocks) + if (!add && !atomic_read(&fi->i_compr_blocks)) return; if (add) { - F2FS_I(inode)->i_compr_blocks += diff; + atomic_add(diff, &fi->i_compr_blocks); stat_add_compr_blocks(inode, diff); } else { - F2FS_I(inode)->i_compr_blocks -= diff; + atomic_sub(diff, &fi->i_compr_blocks); stat_sub_compr_blocks(inode, diff); } f2fs_mark_inode_dirty_sync(inode, true); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4aca4f2a4110..3dbf2dee089e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -564,7 +564,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) bool compressed_cluster = false; int cluster_index = 0, valid_blocks = 0; int cluster_size = F2FS_I(dn->inode)->i_cluster_size; - bool released = !F2FS_I(dn->inode)->i_compr_blocks; + bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks); if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) base = get_extra_isize(dn->inode); @@ -3446,7 +3446,7 @@ static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) if (!f2fs_compressed_file(inode)) return -EINVAL; - blocks = F2FS_I(inode)->i_compr_blocks; + blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); return put_user(blocks, (u64 __user *)arg); } @@ -3550,7 +3550,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); - if (!F2FS_I(inode)->i_compr_blocks) + if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -3598,14 +3598,15 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (ret >= 0) { ret = put_user(released_blocks, (u64 __user *)arg); - } else if (released_blocks && F2FS_I(inode)->i_compr_blocks) { + } else if (released_blocks && 
+ atomic_read(&F2FS_I(inode)->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " - "iblocks=%llu, released=%u, compr_blocks=%llu, " + "iblocks=%llu, released=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, (u64)inode->i_blocks, released_blocks, - F2FS_I(inode)->i_compr_blocks); + atomic_read(&F2FS_I(inode)->i_compr_blocks)); } return ret; @@ -3693,7 +3694,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (ret) return ret; - if (F2FS_I(inode)->i_compr_blocks) + if (atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -3757,14 +3758,15 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (ret >= 0) { ret = put_user(reserved_blocks, (u64 __user *)arg); - } else if (reserved_blocks && F2FS_I(inode)->i_compr_blocks) { + } else if (reserved_blocks && + atomic_read(&F2FS_I(inode)->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " - "iblocks=%llu, reserved=%u, compr_blocks=%llu, " + "iblocks=%llu, reserved=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, (u64)inode->i_blocks, reserved_blocks, - F2FS_I(inode)->i_compr_blocks); + atomic_read(&F2FS_I(inode)->i_compr_blocks)); } return ret; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1a44b43518c8..05347c944422 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -442,7 +442,8 @@ static int do_read_inode(struct inode *inode) (fi->i_flags & F2FS_COMPR_FL)) { if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { - fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks); + atomic_set(&fi->i_compr_blocks, + le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; fi->i_cluster_size = 1 << fi->i_log_cluster_size; @@ -460,7 +461,7 @@ static int do_read_inode(struct inode *inode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); stat_inc_compr_inode(inode); - stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); + stat_add_compr_blocks(inode, atomic_read(&fi->i_compr_blocks)); return 0; } @@ -619,7 +620,8 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_log_cluster_size)) { ri->i_compr_blocks = - cpu_to_le64(F2FS_I(inode)->i_compr_blocks); + cpu_to_le64(atomic_read( + &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; ri->i_log_cluster_size = @@ -768,7 +770,8 @@ void f2fs_evict_inode(struct inode *inode) stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); stat_dec_compr_inode(inode); - stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); + stat_sub_compr_blocks(inode, + atomic_read(&F2FS_I(inode)->i_compr_blocks)); if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5ab3c8ab9885..e70bd085f115 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1013,6 +1013,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); + atomic_set(&fi->i_compr_blocks, 0); init_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); From e44c6b967905534a7e102ad13741c85e59d760d6 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 8 Sep 2020 11:44:11 +0900 Subject: [PATCH 
076/580] f2fs: change return value of f2fs_disable_compressed_file to bool The returned integer is not required anywhere. So we need to change the return value to bool type. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 17 ++++++----------- fs/f2fs/file.c | 4 ++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8813b2b8a1bb..187224cd26f9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3944,7 +3944,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) return ret; - if (f2fs_disable_compressed_file(inode)) + if (!f2fs_disable_compressed_file(inode)) return -EINVAL; ret = check_swap_activate(sis, file, span); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 63c38ffd4f51..1ef35106c9ae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3969,26 +3969,21 @@ static inline void set_compress_context(struct inode *inode) f2fs_mark_inode_dirty_sync(inode, true); } -static inline u32 f2fs_disable_compressed_file(struct inode *inode) +static inline bool f2fs_disable_compressed_file(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - u32 i_compr_blocks; if (!f2fs_compressed_file(inode)) - return 0; - if (S_ISREG(inode->i_mode)) { - if (get_dirty_pages(inode)) - return 1; - i_compr_blocks = atomic_read(&fi->i_compr_blocks); - if (i_compr_blocks) - return i_compr_blocks; - } + return true; + if (S_ISREG(inode->i_mode) && + (get_dirty_pages(inode) || atomic_read(&fi->i_compr_blocks))) + return false; fi->i_flags &= ~F2FS_COMPR_FL; stat_dec_compr_inode(inode); clear_inode_flag(inode, FI_COMPRESSED_FILE); f2fs_mark_inode_dirty_sync(inode, true); - return 0; + return true; } #define F2FS_FEATURE_FUNCS(name, flagname) \ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3dbf2dee089e..9aa198643db6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1835,7 +1835,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if ((iflags ^ masked_flags) & F2FS_COMPR_FL) { if (masked_flags & F2FS_COMPR_FL) { - if (f2fs_disable_compressed_file(inode)) + if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } if (iflags & F2FS_NOCOMP_FL) @@ -3268,7 +3268,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (ret) goto out; - if (f2fs_disable_compressed_file(inode)) { + if (!f2fs_disable_compressed_file(inode)) { ret = -EOPNOTSUPP; goto out; } From 12d139fea10f956a42be918bf78f6acc885f6ab7 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 12 Aug 2020 14:17:11 +0900 Subject: [PATCH 077/580] f2fs: change virtual mapping way for compression pages By profiling f2fs compression works, I've found vmap() callings have unexpected hikes in the execution time in our test environment and those are bottlenecks of f2fs decompression path. Changing these with vm_map_ram(), we can enhance f2fs decompression speed pretty much. 
[Verification] Android Pixel 3(ARM64, 6GB RAM, 128GB UFS) Turned on only 0-3 little cores(at 1.785GHz) dd if=/dev/zero of=dummy bs=1m count=1000 echo 3 > /proc/sys/vm/drop_caches dd if=dummy of=/dev/zero bs=512k - w/o compression - 1048576000 bytes (0.9 G) copied, 2.082554 s, 480 M/s 1048576000 bytes (0.9 G) copied, 2.081634 s, 480 M/s 1048576000 bytes (0.9 G) copied, 2.090861 s, 478 M/s - before patch - 1048576000 bytes (0.9 G) copied, 7.407527 s, 135 M/s 1048576000 bytes (0.9 G) copied, 7.283734 s, 137 M/s 1048576000 bytes (0.9 G) copied, 7.291508 s, 137 M/s - after patch - 1048576000 bytes (0.9 G) copied, 1.998959 s, 500 M/s 1048576000 bytes (0.9 G) copied, 1.987554 s, 503 M/s 1048576000 bytes (0.9 G) copied, 1.986380 s, 503 M/s Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index eaee9c99ceb2..3bf01eb84981 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -526,6 +526,22 @@ static void f2fs_compress_free_page(struct page *page) mempool_free(page, compress_page_pool); } +#define MAX_VMAP_RETRIES 3 + +static void *f2fs_vmap(struct page **pages, unsigned int count) +{ + int i; + void *buf = NULL; + + for (i = 0; i < MAX_VMAP_RETRIES; i++) { + buf = vm_map_ram(pages, count, -1, PAGE_KERNEL); + if (buf) + break; + vm_unmap_aliases(); + } + return buf; +} + static int f2fs_compress_pages(struct compress_ctx *cc) { struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); @@ -562,13 +578,13 @@ static int f2fs_compress_pages(struct compress_ctx *cc) } } - cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO); + cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); if (!cc->rbuf) { ret = -ENOMEM; goto out_free_cpages; } - cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL); + cc->cbuf = f2fs_vmap(cc->cpages, cc->nr_cpages); if (!cc->cbuf) { ret = -ENOMEM; goto out_vunmap_rbuf; @@ -596,8 +612,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc) memset(&cc->cbuf->cdata[cc->clen], 0, (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE)); - vunmap(cc->cbuf); - vunmap(cc->rbuf); + vm_unmap_ram(cc->cbuf, cc->nr_cpages); + vm_unmap_ram(cc->rbuf, cc->cluster_size); for (i = nr_cpages; i < cc->nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); @@ -614,9 +630,9 @@ static int f2fs_compress_pages(struct compress_ctx *cc) return 0; out_vunmap_cbuf: - vunmap(cc->cbuf); + vm_unmap_ram(cc->cbuf, cc->nr_cpages); out_vunmap_rbuf: - vunmap(cc->rbuf); + vm_unmap_ram(cc->rbuf, cc->cluster_size); out_free_cpages: for (i = 0; i < cc->nr_cpages; i++) { if (cc->cpages[i]) @@ -687,13 +703,13 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) goto out_free_dic; } - dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL); + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) { ret = -ENOMEM; goto destroy_decompress_ctx; } - dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO); + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); if (!dic->cbuf) { ret = -ENOMEM; goto out_vunmap_rbuf; @@ -710,9 +726,9 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) ret = cops->decompress_pages(dic); out_vunmap_cbuf: - vunmap(dic->cbuf); + vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: - vunmap(dic->rbuf); + vm_unmap_ram(dic->rbuf, dic->cluster_size); destroy_decompress_ctx: if 
(cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); From 0f1b8c8f8d4ac366cedfa2988f35bfa684e20389 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 14 Sep 2020 16:47:00 +0800 Subject: [PATCH 078/580] f2fs: clean up kvfree After commit 0b6d4ca04a86 ("f2fs: don't return vmalloc() memory from f2fs_kmalloc()"), f2fs_k{m,z}alloc() will not return vmalloc()'ed memory, so clean up to use kfree() instead of kvfree() to free vmalloc()'ed memory. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 6 +++--- fs/f2fs/data.c | 2 +- fs/f2fs/debug.c | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 4 ++-- fs/f2fs/inline.c | 4 ++-- fs/f2fs/namei.c | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/segment.c | 24 ++++++++++++------------ fs/f2fs/xattr.c | 8 ++++---- 10 files changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 217b290ae3a5..306413589827 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -160,7 +160,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, return (void *)f2fs_acl; fail: - kvfree(f2fs_acl); + kfree(f2fs_acl); return ERR_PTR(-EINVAL); } @@ -190,7 +190,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, acl = NULL; else acl = ERR_PTR(retval); - kvfree(value); + kfree(value); return acl; } @@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); - kvfree(value); + kfree(value); if (!error) set_cached_acl(inode, type, acl); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 187224cd26f9..2498909bca05 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3524,7 +3524,7 @@ static void f2fs_dio_end_io(struct bio *bio) bio->bi_private = dio->orig_private; bio->bi_end_io = dio->orig_end_io; - kvfree(dio); + kfree(dio); bio_endio(bio); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f3cc34db86d7..a8357fd4f5fa 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -574,7 +574,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); - kvfree(si); + kfree(si); } void __init f2fs_create_root_stats(void) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9aa198643db6..8d9b4eb07806 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3395,7 +3395,7 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) min(FSLABEL_MAX, count))) err = -EFAULT; - kvfree(vbuf); + kfree(vbuf); return err; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 84b9dac942e3..05641a1e36cc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -152,7 +152,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { err = PTR_ERR(gc_th->f2fs_gc_task); - kvfree(gc_th); + kfree(gc_th); sbi->gc_thread = NULL; } out: @@ -165,7 +165,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); - kvfree(gc_th); + kfree(gc_th); sbi->gc_thread = NULL; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index c4a51a3a257e..cd1cc890765c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -523,7 +523,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, !f2fs_has_inline_xattr(dir)) F2FS_I(dir)->i_inline_xattr_size = 0; - kvfree(backup_dentry); + kfree(backup_dentry); return 0; recover: lock_page(ipage); @@ -534,7 +534,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, set_page_dirty(ipage); f2fs_put_page(ipage, 1); - 
kvfree(backup_dentry); + kfree(backup_dentry); return err; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 84e4bbc1a64d..90565432559c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -707,7 +707,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_handle_failed_inode(inode); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) - kvfree(disk_link.name); + kfree(disk_link.name); return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 38ebfc83caf7..5d9da09064c6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -3257,7 +3257,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) kvfree(nm_i->nat_bitmap_mir); #endif sbi->nm_info = NULL; - kvfree(nm_i); + kfree(nm_i); } int __init f2fs_create_node_manager_caches(void) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5a9928c1ffb5..23f0cc300484 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -728,7 +728,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); - kvfree(fcc); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; } @@ -747,7 +747,7 @@ void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) kthread_stop(flush_thread); } if (free) { - kvfree(fcc); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; } } @@ -2104,7 +2104,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); - kvfree(dcc); + kfree(dcc); SM_I(sbi)->dcc_info = NULL; return err; } @@ -2128,7 +2128,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) f2fs_issue_discard_timeout(sbi); - kvfree(dcc); + kfree(dcc); SM_I(sbi)->dcc_info = NULL; } @@ -4799,7 +4799,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; - kvfree(dirty_i); + kfree(dirty_i); } static void destroy_curseg(struct f2fs_sb_info *sbi) @@ -4811,10 +4811,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) return; SM_I(sbi)->curseg_array = NULL; for (i = 0; i < NR_CURSEG_TYPE; i++) { - kvfree(array[i].sum_blk); - kvfree(array[i].journal); + kfree(array[i].sum_blk); + kfree(array[i].journal); } - kvfree(array); + kfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) @@ -4825,7 +4825,7 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = NULL; kvfree(free_i->free_segmap); kvfree(free_i->free_secmap); - kvfree(free_i); + kfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) @@ -4837,7 +4837,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) kvfree(sit_i->bitmap); - kvfree(sit_i->tmp_map); + kfree(sit_i->tmp_map); kvfree(sit_i->sentries); kvfree(sit_i->sec_entries); @@ -4849,7 +4849,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kvfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif - kvfree(sit_i); + kfree(sit_i); } void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) @@ -4865,7 +4865,7 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) destroy_free_segmap(sbi); destroy_sit_info(sbi); sbi->sm_info = NULL; - kvfree(sm_info); + kfree(sm_info); } int __init f2fs_create_segment_manager_caches(void) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1b0736ce0918..65afcc3cc68a 
100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -39,7 +39,7 @@ static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr, if (is_inline) kmem_cache_free(sbi->inline_xattr_slab, xattr_addr); else - kvfree(xattr_addr); + kfree(xattr_addr); } static int f2fs_xattr_generic_get(const struct xattr_handler *handler, @@ -425,7 +425,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kvfree(txattr_addr); + kfree(txattr_addr); return err; } @@ -610,7 +610,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kvfree(base_addr); + kfree(base_addr); return error; } @@ -750,7 +750,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: - kvfree(base_addr); + kfree(base_addr); return error; } From 3dc690bc8b994a9f7a5fe120576990d5e7750299 Mon Sep 17 00:00:00 2001 From: Xiaojun Wang Date: Wed, 16 Sep 2020 16:00:46 +0800 Subject: [PATCH 079/580] f2fs: change return value of reserved_segments to unsigned int The type of SM_I(sbi)->reserved_segments is unsigned int, so change the return value to unsigned int. The type cast can be removed in reserved_sections as a result. Signed-off-by: Xiaojun Wang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 47b888ad913b..f5b1883eda7a 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -524,7 +524,7 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) return FREE_I(sbi)->free_segments; } -static inline int reserved_segments(struct f2fs_sb_info *sbi) +static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->reserved_segments; } @@ -556,7 +556,7 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); + return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) From d8ce60af2f8146ce2be4b914c484c5c124362847 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Mon, 21 Sep 2020 20:45:44 +0800 Subject: [PATCH 080/580] f2fs: add trace exit in exception path Missing the trace exit in f2fs_sync_dirty_inodes Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6059ce3758d8..3c6fd7a2a819 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1047,8 +1047,12 @@ int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) get_pages(sbi, is_dir ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? 
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return -EIO; + } spin_lock(&sbi->inode_lock[type]); From a31ca2171d30e48df06bd5852f6583a452e5b582 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 21 Sep 2020 20:53:13 +0800 Subject: [PATCH 081/580] f2fs: do sanity check on zoned block device path sbi->devs would be initialized only if image enables multiple device feature or blkzoned feature, if blkzoned feature flag was set by fuzz in non-blkzoned device, we will suffer below panic: get_zone_idx fs/f2fs/segment.c:4892 [inline] f2fs_usable_zone_blks_in_seg fs/f2fs/segment.c:4943 [inline] f2fs_usable_blks_in_seg+0x39b/0xa00 fs/f2fs/segment.c:4999 Call Trace: check_block_count+0x69/0x4e0 fs/f2fs/segment.h:704 build_sit_entries fs/f2fs/segment.c:4403 [inline] f2fs_build_segment_manager+0x51da/0xa370 fs/f2fs/segment.c:5100 f2fs_fill_super+0x3880/0x6ff0 fs/f2fs/super.c:3684 mount_bdev+0x32e/0x3f0 fs/super.c:1417 legacy_get_tree+0x105/0x220 fs/fs_context.c:592 vfs_get_tree+0x89/0x2f0 fs/super.c:1547 do_new_mount fs/namespace.c:2896 [inline] path_mount+0x12ae/0x1e70 fs/namespace.c:3216 do_mount fs/namespace.c:3229 [inline] __do_sys_mount fs/namespace.c:3437 [inline] __se_sys_mount fs/namespace.c:3414 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3414 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 Add sanity check to inconsistency on factors: blkzoned flag, device path and device character to avoid above panic. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e70bd085f115..b61714dfef9a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2816,6 +2816,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, segment_count, dev_seg_count); return -EFSCORRUPTED; } + } else { + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_BLKZONED) && + !bdev_is_zoned(sbi->sb->s_bdev)) { + f2fs_info(sbi, "Zoned block device path is missing"); + return -EFSCORRUPTED; + } } if (secs_per_zone > total_sections || !secs_per_zone) { From 719dc3ea0f7b63b3033a65b697d8f892a36bc0f3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 21 Sep 2020 20:53:14 +0800 Subject: [PATCH 082/580] f2fs: relocate blkzoned feature check Relocate blkzoned feature check into parse_options() like other feature check. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b61714dfef9a..19aebe1e6a0e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -957,6 +957,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } #endif + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Zoned block device support is not enabled"); + return -EINVAL; + } +#endif if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", @@ -3451,18 +3462,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, sizeof(raw_super->uuid)); - /* - * The BLKZONED feature indicates that the drive was formatted with - * zone alignment optimization. 
This is optional for host-aware - * devices, but mandatory for host-managed zoned block devices. - */ -#ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device support is not enabled"); - err = -EOPNOTSUPP; - goto free_sb_buf; - } -#endif default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); From 575b3a1bd0b62d71e2fadf31dcdf3bcc783af79d Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Sat, 19 Sep 2020 11:35:05 +0800 Subject: [PATCH 083/580] f2fs: remove unused check on version_bitmap A NULL will not be return by __bitmap_ptr here. Remove the unused check. Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5d9da09064c6..c9c146a43931 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -3105,9 +3105,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); - if (!version_bitmap) - return -EFAULT; - nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); if (!nm_i->nat_bitmap) From 4011fb376130dcbfa73f844219ec7ea63360679a Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Fri, 18 Sep 2020 08:31:24 +0800 Subject: [PATCH 084/580] f2fs: remove duplicated code in sanity_check_area_boundary Use seg_end_blkaddr instead of "segment0_blkaddr + (segment_count << log_blocks_per_seg)". Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 19aebe1e6a0e..131aa241799d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2675,10 +2675,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, } if (main_end_blkaddr > seg_end_blkaddr) { - f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)", - main_blkaddr, - segment0_blkaddr + - (segment_count << log_blocks_per_seg), + f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%llu) block(%u)", + main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); return true; } else if (main_end_blkaddr < seg_end_blkaddr) { @@ -2696,10 +2694,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, err = __f2fs_commit_super(bh, NULL); res = err ? "failed" : "done"; } - f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%u) block(%u)", - res, main_blkaddr, - segment0_blkaddr + - (segment_count << log_blocks_per_seg), + f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%llu) block(%u)", + res, main_blkaddr, seg_end_blkaddr, segment_count_main << log_blocks_per_seg); if (err) return true; From 18d143111feb5ef93e1a0c03be207a8874349897 Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Thu, 17 Sep 2020 19:11:58 +0800 Subject: [PATCH 085/580] f2fs: fix wrong total_sections check and fsmeta check Meta area is not included in section_count computation. So the minimum number of total_sections is 1 meanwhile it cannot be greater than segment_count_main. The minimum number of meta segments is 8 (SB + 2 (CP + SIT + NAT) + SSA). 
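(Worked out, illustratively: the meta floor of 8 segments is one for the superblock area, two each for CP, SIT and NAT, and one for SSA, i.e. 1 + (2 + 2 + 2) + 1 = 8; the existing floor of 9 simply adds a single MAIN segment on top. Since section_count describes the main area only, the corrected bound looks roughly like the sketch below, not the literal hunk.)

  /* sections cover MAIN only, so bound them by segment_count_main,
   * and at least one section must exist */
  if (total_sections > segment_count_main || total_sections < 1)
          return -EFSCORRUPTED;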
Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 1 + fs/f2fs/super.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index f5b1883eda7a..e81eb0748e2a 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,6 +16,7 @@ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ +#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 131aa241799d..888905474b13 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2706,7 +2706,7 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { - block_t segment_count, segs_per_sec, secs_per_zone; + block_t segment_count, segs_per_sec, secs_per_zone, segment_count_main; block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); @@ -2777,6 +2777,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } segment_count = le32_to_cpu(raw_super->segment_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); total_sections = le32_to_cpu(raw_super->section_count); @@ -2790,8 +2791,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - if (total_sections > segment_count || - total_sections < F2FS_MIN_SEGMENTS || + if (total_sections > segment_count_main || total_sections < 1 || segs_per_sec > segment_count || !segs_per_sec) { f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)", segment_count, total_sections, segs_per_sec); @@ -2904,7 +2904,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); - if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + if (unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || ovp_segments == 0 || reserved_segments == 0)) { f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version"); return 1; From e991dcaec2a3b0e6358b74cff258ca62142cdc08 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Sep 2020 08:20:46 +0800 Subject: [PATCH 086/580] f2fs: remove unneeded parameter in find_in_block() We can relocate @res_page assignment in find_in_block() to its caller, so unneeded parameter could be removed for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f2f109493b4f..531afc84f890 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -191,21 +191,15 @@ static unsigned long dir_block_index(unsigned int level, static struct f2fs_dir_entry *find_in_block(struct inode *dir, struct page *dentry_page, const struct f2fs_filename *fname, - int *max_slots, - struct page **res_page) + int *max_slots) { struct f2fs_dentry_block *dentry_blk; - struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(dir, &d, dentry_blk); - de = f2fs_find_target_dentry(&d, fname, max_slots); - if (de) - *res_page = dentry_page; - - return de; + return f2fs_find_target_dentry(&d, fname, max_slots); } #ifdef CONFIG_UNICODE @@ -331,10 +325,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, } } - de = find_in_block(dir, dentry_page, fname, &max_slots, - res_page); - if (de) + de = find_in_block(dir, dentry_page, fname, &max_slots); + if (de) { + *res_page = dentry_page; break; + } if (max_slots >= s) room = true; From 6d738c049ed94302e11b92680ed1669b61d35c68 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Jun 2020 21:45:36 -0700 Subject: [PATCH 087/580] vfs: track per-sb writeback errors and report them to syncfs Patch series "vfs: have syncfs() return error when there are writeback errors", v6. Currently, syncfs does not return errors when one of the inodes fails to be written back. It will return errors based on the legacy AS_EIO and AS_ENOSPC flags when syncing out the block device fails, but that's not particularly helpful for filesystems that aren't backed by a blockdev. It's also possible for a stray sync to lose those errors. The basic idea in this set is to track writeback errors at the superblock level, so that we can quickly and easily check whether something bad happened without having to fsync each file individually. syncfs is then changed to reliably report writeback errors after they occur, much in the same fashion as fsync does now. This patch (of 2): Usually we suggest that applications call fsync when they want to ensure that all data written to the file has made it to the backing store, but that can be inefficient when there are a lot of open files. Calling syncfs on the filesystem can be more efficient in some situations, but the error reporting doesn't currently work the way most people expect. If a single inode on a filesystem reports a writeback error, syncfs won't necessarily return an error. syncfs only returns an error if __sync_blockdev fails, and on some filesystems that's a no-op. It would be better if syncfs reported an error if there were any writeback failures. Then applications could call syncfs to see if there are any errors on any open files, and could then call fsync on all of the other descriptors to figure out which one failed. This patch adds a new errseq_t to struct super_block, and has mapping_set_error also record writeback errors there. To report those errors, we also need to keep an errseq_t in struct file to act as a cursor. This patch adds a dedicated field for that purpose, which slots nicely into 4 bytes of padding at the end of struct file on x86_64. An earlier version of this patch used an O_PATH file descriptor to cue the kernel that the open file should track the superblock error and not the inode's writeback error. 
I think that API is just too weird though. This is simpler and should make syncfs error reporting "just work" even if someone is multiplexing fsync and syncfs on the same fds. Signed-off-by: Jeff Layton Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Cc: Andres Freund Cc: Matthew Wilcox Cc: Al Viro Cc: Christoph Hellwig Cc: Dave Chinner Cc: David Howells Link: http://lkml.kernel.org/r/20200428135155.19223-1-jlayton@kernel.org Link: http://lkml.kernel.org/r/20200428135155.19223-2-jlayton@kernel.org Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 1 + fs/file_table.c | 1 + fs/open.c | 3 +-- fs/sync.c | 6 ++++-- include/linux/fs.h | 16 ++++++++++++++++ include/linux/pagemap.h | 5 ++++- 6 files changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 948806e57cee..fdf17eca6b92 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -552,6 +552,7 @@ static int dax_open(struct inode *inode, struct file *filp) inode->i_mapping->a_ops = &dev_dax_aops; filp->f_mapping = inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->f_sb_err = file_sample_sb_err(filp); filp->private_data = dev_dax; inode->i_flags = S_DAX; diff --git a/fs/file_table.c b/fs/file_table.c index e49af4caf15d..768f954bf599 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -197,6 +197,7 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); + file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; diff --git a/fs/open.c b/fs/open.c index 0285ce7dbd51..52d891b4c3b4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -723,9 +723,8 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; - - /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; diff --git a/fs/sync.c b/fs/sync.c index b54e0541ad89..73343bec5d9a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -161,7 +161,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) { struct fd f = fdget(fd); struct super_block *sb; - int ret; + int ret, ret2; if (!f.file) return -EBADF; @@ -171,8 +171,10 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret = sync_filesystem(sb); up_read(&sb->s_umount); + ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err); + fdput(f); - return ret; + return ret ? 
ret : ret2; } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 2d3382a8729d..c192bb8ebfb3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -923,6 +923,7 @@ struct file { #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; + errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -1451,6 +1452,9 @@ struct super_block { /* Being remounted read-only */ int s_readonly_remount; + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; @@ -2744,6 +2748,18 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. + */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b1bd2186e6d2..2de87c5a2718 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -51,7 +51,10 @@ static inline void mapping_set_error(struct address_space *mapping, int error) return; /* Record in wb_err for checkers using errseq_t based tracking */ - filemap_set_wb_err(mapping, error); + __filemap_set_wb_err(mapping, error); + + /* Record it in superblock */ + errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) From aacd59c5cb7d4b46a196d5b507f3bb22fa0b27bf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 29 Sep 2020 09:22:50 +0800 Subject: [PATCH 088/580] f2fs: fix uninit-value in f2fs_lookup As syzbot reported: Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x21c/0x280 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:122 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:219 f2fs_lookup+0xe05/0x1a80 fs/f2fs/namei.c:503 lookup_open fs/namei.c:3082 [inline] open_last_lookups fs/namei.c:3177 [inline] path_openat+0x2729/0x6a90 fs/namei.c:3365 do_filp_open+0x2b8/0x710 fs/namei.c:3395 do_sys_openat2+0xa88/0x1140 fs/open.c:1168 do_sys_open fs/open.c:1184 [inline] __do_compat_sys_openat fs/open.c:1242 [inline] __se_compat_sys_openat+0x2a4/0x310 fs/open.c:1240 __ia32_compat_sys_openat+0x56/0x70 fs/open.c:1240 do_syscall_32_irqs_on arch/x86/entry/common.c:80 [inline] __do_fast_syscall_32+0x129/0x180 arch/x86/entry/common.c:139 do_fast_syscall_32+0x6a/0xc0 arch/x86/entry/common.c:162 do_SYSENTER_32+0x73/0x90 arch/x86/entry/common.c:205 entry_SYSENTER_compat_after_hwframe+0x4d/0x5c In f2fs_lookup(), @res_page could be used before being initialized, because in __f2fs_find_entry(), once F2FS_I(dir)->i_current_depth was been fuzzed to zero, then @res_page will never be initialized, causing this kmsan warning, relocating @res_page initialization place to fix this bug. 
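A minimal sketch of the path syzbot hit (simplified from __f2fs_find_entry()/f2fs_lookup(), not a verbatim quote):

    struct page *page;                          /* not initialized by the caller */
    ...
    max_depth = F2FS_I(dir)->i_current_depth;   /* fuzzed image: 0 */
    for (level = 0; level < max_depth; level++)
        de = find_in_level(dir, level, fname, &page);   /* loop body never runs */
    /* de stays NULL, yet the caller still inspects 'page' (e.g. via IS_ERR()) -> uninit-value */

Setting *res_page = NULL once at the top of __f2fs_find_entry() covers this and every other early-exit path, which is why the per-branch and per-iteration assignments can be dropped.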
Reported-by: syzbot+0eac6f0bbd558fd866d7@syzkaller.appspotmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 531afc84f890..7808cf512041 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -353,16 +353,15 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; + *res_page = NULL; + if (f2fs_has_inline_dentry(dir)) { - *res_page = NULL; de = f2fs_find_in_inline_dir(dir, fname, res_page); goto out; } - if (npages == 0) { - *res_page = NULL; + if (npages == 0) goto out; - } max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { @@ -373,7 +372,6 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, } for (level = 0; level < max_depth; level++) { - *res_page = NULL; de = find_in_level(dir, level, fname, res_page); if (de || IS_ERR(*res_page)) break; From 5452ff8f53ba061395e2646d2331cbfc27d77050 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 29 Sep 2020 09:23:12 +0800 Subject: [PATCH 089/580] f2fs: fix to check segment boundary during SIT page readahead As syzbot reported: kernel BUG at fs/f2fs/segment.h:657! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 16220 Comm: syz-executor.0 Not tainted 5.9.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:f2fs_ra_meta_pages+0xa51/0xdc0 fs/f2fs/segment.h:657 Call Trace: build_sit_entries fs/f2fs/segment.c:4195 [inline] f2fs_build_segment_manager+0x4b8a/0xa3c0 fs/f2fs/segment.c:4779 f2fs_fill_super+0x377d/0x6b80 fs/f2fs/super.c:3633 mount_bdev+0x32e/0x3f0 fs/super.c:1417 legacy_get_tree+0x105/0x220 fs/fs_context.c:592 vfs_get_tree+0x89/0x2f0 fs/super.c:1547 do_new_mount fs/namespace.c:2875 [inline] path_mount+0x1387/0x2070 fs/namespace.c:3192 do_mount fs/namespace.c:3205 [inline] __do_sys_mount fs/namespace.c:3413 [inline] __se_sys_mount fs/namespace.c:3390 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3390 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 @blkno in f2fs_ra_meta_pages could exceed max segment count, causing panic in following sanity check in current_sit_addr(), add check condition to avoid this issue. 
Reported-by: syzbot+3698081bcf0bb2d12174@syzkaller.appspotmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3c6fd7a2a819..f18386d30f03 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -243,6 +243,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: + if (unlikely(blkno >= TOTAL_SEGS(sbi))) + goto out; /* get sit block addr */ fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); From f7a1aee2c8a35749916482b9335302b49a8da3f2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 29 Sep 2020 09:23:34 +0800 Subject: [PATCH 090/580] f2fs: fix to do sanity check on segment/section count As syzbot reported: BUG: KASAN: slab-out-of-bounds in init_min_max_mtime fs/f2fs/segment.c:4710 [inline] BUG: KASAN: slab-out-of-bounds in f2fs_build_segment_manager+0x9302/0xa6d0 fs/f2fs/segment.c:4792 Read of size 8 at addr ffff8880a1b934a8 by task syz-executor682/6878 CPU: 1 PID: 6878 Comm: syz-executor682 Not tainted 5.9.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x198/0x1fd lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 init_min_max_mtime fs/f2fs/segment.c:4710 [inline] f2fs_build_segment_manager+0x9302/0xa6d0 fs/f2fs/segment.c:4792 f2fs_fill_super+0x381a/0x6e80 fs/f2fs/super.c:3633 mount_bdev+0x32e/0x3f0 fs/super.c:1417 legacy_get_tree+0x105/0x220 fs/fs_context.c:592 vfs_get_tree+0x89/0x2f0 fs/super.c:1547 do_new_mount fs/namespace.c:2875 [inline] path_mount+0x1387/0x20a0 fs/namespace.c:3192 do_mount fs/namespace.c:3205 [inline] __do_sys_mount fs/namespace.c:3413 [inline] __se_sys_mount fs/namespace.c:3390 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3390 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The root cause is: if segs_per_sec is larger than one, and segment count in last section is less than segs_per_sec, we will suffer out-of-boundary memory access on sit_i->sentries[] in init_min_max_mtime(). Fix this by adding sanity check among segment count, section count and segs_per_sec value in sanity_check_raw_super(). 
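A hypothetical layout showing the overflow (numbers invented for illustration): take segs_per_sec = 4, segment_count_main = 10 and total_sections = 3. The three sections nominally cover segments 0..11, so init_min_max_mtime(), which walks the MAIN area a full section at a time, ends up touching sit_i->sentries[10] and [11] beyond the 10 allocated entries. Requiring segment_count_main == total_sections * segs_per_sec rejects such an image before the segment manager is built.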
Reported-by: syzbot+481a3ffab50fed41dcc0@syzkaller.appspotmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 888905474b13..426ef4f2eeb8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2798,6 +2798,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } + if (segment_count_main != total_sections * segs_per_sec) { + f2fs_info(sbi, "Invalid segment/section count (%u != %u * %u)", + segment_count_main, total_sections, segs_per_sec); + return -EFSCORRUPTED; + } + if ((segment_count / segs_per_sec) < total_sections) { f2fs_info(sbi, "Small segment_count (%u < %u * %u)", segment_count, segs_per_sec, total_sections); From 4c139cdc5353734e01f33e10038ca4a69e2389d8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 14 Sep 2020 17:05:13 +0800 Subject: [PATCH 091/580] f2fs: compress: introduce page array slab cache Add a per-sbi slab cache "f2fs_page_array_entry-%u:%u" for memory allocation of page pointer array in compress context. Signed-off-by: Chao Yu [Jaegeuk Kim: Fix wrong memory allocation] Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 116 +++++++++++++++++++++++++++++++++------------ fs/f2fs/f2fs.h | 9 ++++ fs/f2fs/super.c | 8 +++- 3 files changed, 102 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 3bf01eb84981..92a86dca5011 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -18,6 +18,30 @@ #include "node.h" #include +static void *page_array_alloc(struct inode *inode, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (likely(size <= sbi->page_array_slab_size)) + return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS); + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void page_array_free(struct inode *inode, void *pages, int nr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int size = sizeof(struct page *) * nr; + + if (!pages) + return; + + if (likely(size <= sbi->page_array_slab_size)) + kmem_cache_free(sbi->page_array_slab, pages); + else + kfree(pages); +} + struct f2fs_compress_ops { int (*init_compress_ctx)(struct compress_ctx *cc); void (*destroy_compress_ctx)(struct compress_ctx *cc); @@ -131,19 +155,16 @@ struct page *f2fs_compress_control_page(struct page *page) int f2fs_init_compress_ctx(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); - if (cc->nr_rpages) return 0; - cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); return cc->rpages ? 
0 : -ENOMEM; } void f2fs_destroy_compress_ctx(struct compress_ctx *cc) { - kfree(cc->rpages); + page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; @@ -544,11 +565,11 @@ static void *f2fs_vmap(struct page **pages, unsigned int count) static int f2fs_compress_pages(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct f2fs_inode_info *fi = F2FS_I(cc->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; - unsigned int max_len, nr_cpages; + unsigned int max_len, new_nr_cpages; + struct page **new_cpages; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, @@ -563,8 +584,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); - cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - cc->nr_cpages, GFP_NOFS); + cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { ret = -ENOMEM; goto destroy_compress_ctx; @@ -606,16 +626,28 @@ static int f2fs_compress_pages(struct compress_ctx *cc) for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); - nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + /* Now we're going to cut unnecessary tail pages */ + new_cpages = page_array_alloc(cc->inode, new_nr_cpages); + if (!new_cpages) { + ret = -ENOMEM; + goto out_vunmap_cbuf; + } /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, - (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE)); + (new_nr_cpages * PAGE_SIZE) - + (cc->clen + COMPRESS_HEADER_SIZE)); vm_unmap_ram(cc->cbuf, cc->nr_cpages); vm_unmap_ram(cc->rbuf, cc->cluster_size); - for (i = nr_cpages; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->nr_cpages; i++) { + if (i < new_nr_cpages) { + new_cpages[i] = cc->cpages[i]; + continue; + } f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -623,7 +655,9 @@ static int f2fs_compress_pages(struct compress_ctx *cc) if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); - cc->nr_cpages = nr_cpages; + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = new_cpages; + cc->nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); @@ -638,7 +672,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) if (cc->cpages[i]) f2fs_compress_free_page(cc->cpages[i]); } - kfree(cc->cpages); + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; destroy_compress_ctx: if (cops->destroy_compress_ctx) @@ -677,8 +711,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) goto out_free_dic; } - dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - dic->cluster_size, GFP_NOFS); + dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) { ret = -ENOMEM; goto out_free_dic; @@ -1017,6 +1050,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, { struct compress_ctx cc = { + .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .rpages = fsdata, @@ -1120,7 +1154,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, */ down_read(&sbi->node_write); } else if (!f2fs_trylock_op(sbi)) { - return -EAGAIN; + goto out_free; } set_new_dnode(&dn, cc->inode, 
NULL, NULL, 0); @@ -1150,8 +1184,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; atomic_set(&cic->pending_pages, cc->nr_cpages); - cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; @@ -1245,11 +1278,13 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; f2fs_destroy_compress_ctx(cc); return 0; out_destroy_crypt: - kfree(cic->rpages); + page_array_free(cc->inode, cic->rpages, cc->cluster_size); for (--i; i >= 0; i--) fscrypt_finalize_bounce_page(&cc->cpages[i]); @@ -1267,6 +1302,9 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, up_read(&sbi->node_write); else f2fs_unlock_op(sbi); +out_free: + page_array_free(cc->inode, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; return -EAGAIN; } @@ -1293,7 +1331,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) end_page_writeback(cic->rpages[i]); } - kfree(cic->rpages); + page_array_free(cic->inode, cic->rpages, cic->nr_rpages); kfree(cic); } @@ -1390,8 +1428,6 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_compressed_pages(cc, submitted, wbc, io_type); - kfree(cc->cpages); - cc->cpages = NULL; if (!err) return 0; f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); @@ -1417,8 +1453,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) if (!dic) return ERR_PTR(-ENOMEM); - dic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << - cc->log_cluster_size, GFP_NOFS); + dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!dic->rpages) { kfree(dic); return ERR_PTR(-ENOMEM); @@ -1437,8 +1472,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->rpages[i] = cc->rpages[i]; dic->nr_rpages = cc->cluster_size; - dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * - dic->nr_cpages, GFP_NOFS); + dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); if (!dic->cpages) goto out_free; @@ -1473,7 +1507,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) continue; f2fs_compress_free_page(dic->tpages[i]); } - kfree(dic->tpages); + page_array_free(dic->inode, dic->tpages, dic->cluster_size); } if (dic->cpages) { @@ -1482,10 +1516,10 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) continue; f2fs_compress_free_page(dic->cpages[i]); } - kfree(dic->cpages); + page_array_free(dic->inode, dic->cpages, dic->nr_cpages); } - kfree(dic->rpages); + page_array_free(dic->inode, dic->rpages, dic->nr_rpages); kfree(dic); } @@ -1514,3 +1548,25 @@ void f2fs_decompress_end_io(struct page **rpages, unlock_page(rpage); } } + +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[32]; + + sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->page_array_slab_size = sizeof(struct page *) << + F2FS_OPTION(sbi).compress_log_size; + + sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, + sbi->page_array_slab_size); + if (!sbi->page_array_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->page_array_slab); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1ef35106c9ae..382185d91270 100644 --- a/fs/f2fs/f2fs.h +++ 
b/fs/f2fs/f2fs.h @@ -1619,6 +1619,11 @@ struct f2fs_sb_info { struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ + +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct kmem_cache *page_array_slab; /* page array entry */ + unsigned int page_array_slab_size; /* default page array slab size */ +#endif }; struct f2fs_private_dio { @@ -3935,6 +3940,8 @@ void f2fs_decompress_end_io(struct page **rpages, int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); #else static inline bool f2fs_is_compressed_page(struct page *page) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -3951,6 +3958,8 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } +static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } #endif static inline void set_compress_context(struct inode *inode) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 426ef4f2eeb8..a7e4c52e9d63 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1282,6 +1282,7 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); + f2fs_destroy_page_array_cache(sbi); f2fs_destroy_xattr_caches(sbi); mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA @@ -3578,13 +3579,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = f2fs_init_xattr_caches(sbi); if (err) goto free_io_dummy; + err = f2fs_init_page_array_cache(sbi); + if (err) + goto free_xattr_cache; /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_err(sbi, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_xattr_cache; + goto free_page_array_cache; } err = f2fs_get_valid_checkpoint(sbi); @@ -3849,6 +3853,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); sbi->meta_inode = NULL; +free_page_array_cache: + f2fs_destroy_page_array_cache(sbi); free_xattr_cache: f2fs_destroy_xattr_caches(sbi); free_io_dummy: From 8db54d0238857eaf42ad4fe047771f62bc2322f1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 14 Sep 2020 17:05:14 +0800 Subject: [PATCH 092/580] f2fs: compress: introduce cic/dic slab cache Add two slab caches: "f2fs_cic_entry" and "f2fs_dic_entry" for memory allocation of compress_io_ctx and decompress_io_ctx structure. 
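One note on the design choice (an observation, not part of the change itself): giving these objects their own named caches instead of plain f2fs_kzalloc()/kfree() means allocations on the compress read/write path come from dedicated pools, and leaks show up under a recognizable name; the "Objects remaining in f2fs_page_array_entry-... on __kmem_cache_shutdown()" splat addressed a couple of patches later in this series is exactly that kind of report.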
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 67 +++++++++++++++++++++++++++++++++++++++++----- fs/f2fs/f2fs.h | 4 +++ fs/f2fs/super.c | 6 +++++ 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 92a86dca5011..44b39271da0d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -18,6 +18,9 @@ #include "node.h" #include +static struct kmem_cache *cic_entry_slab; +static struct kmem_cache *dic_entry_slab; + static void *page_array_alloc(struct inode *inode, int nr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1177,7 +1180,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, fio.version = ni.version; - cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS); + cic = kmem_cache_zalloc(cic_entry_slab, GFP_NOFS); if (!cic) goto out_put_dnode; @@ -1294,7 +1297,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, f2fs_put_page(cc->cpages[i], 1); } out_put_cic: - kfree(cic); + kmem_cache_free(cic_entry_slab, cic); out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: @@ -1332,7 +1335,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) } page_array_free(cic->inode, cic->rpages, cic->nr_rpages); - kfree(cic); + kmem_cache_free(cic_entry_slab, cic); } static int f2fs_write_raw_pages(struct compress_ctx *cc, @@ -1444,18 +1447,17 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); int i; - dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS); + dic = kmem_cache_zalloc(dic_entry_slab, GFP_NOFS); if (!dic) return ERR_PTR(-ENOMEM); dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!dic->rpages) { - kfree(dic); + kmem_cache_free(dic_entry_slab, dic); return ERR_PTR(-ENOMEM); } @@ -1520,7 +1522,7 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) } page_array_free(dic->inode, dic->rpages, dic->nr_rpages); - kfree(dic); + kmem_cache_free(dic_entry_slab, dic); } void f2fs_decompress_end_io(struct page **rpages, @@ -1570,3 +1572,54 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { kmem_cache_destroy(sbi->page_array_slab); } + +static int __init f2fs_init_cic_cache(void) +{ + cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", + sizeof(struct compress_io_ctx)); + if (!cic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_cic_cache(void) +{ + kmem_cache_destroy(cic_entry_slab); +} + +static int __init f2fs_init_dic_cache(void) +{ + dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", + sizeof(struct decompress_io_ctx)); + if (!dic_entry_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_dic_cache(void) +{ + kmem_cache_destroy(dic_entry_slab); +} + +int __init f2fs_init_compress_cache(void) +{ + int err; + + err = f2fs_init_cic_cache(); + if (err) + goto out; + err = f2fs_init_dic_cache(); + if (err) + goto free_cic; + return 0; +free_cic: + f2fs_destroy_cic_cache(); +out: + return -ENOMEM; +} + +void f2fs_destroy_compress_cache(void) +{ + f2fs_destroy_dic_cache(); + f2fs_destroy_cic_cache(); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 382185d91270..c9bce3744043 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3942,6 +3942,8 @@ void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); int 
f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); +int __init f2fs_init_compress_cache(void); +void f2fs_destroy_compress_cache(void); #else static inline bool f2fs_is_compressed_page(struct page *page) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -3960,6 +3962,8 @@ static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } +static inline int __init f2fs_init_compress_cache(void) { return 0; } +static inline void f2fs_destroy_compress_cache(void) { } #endif static inline void set_compress_context(struct inode *inode) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a7e4c52e9d63..3399728002bc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4001,7 +4001,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_compress_mempool(); if (err) goto free_bioset; + err = f2fs_init_compress_cache(); + if (err) + goto free_compress_mempool; return 0; +free_compress_mempool: + f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); free_bio_enrty_cache: @@ -4033,6 +4038,7 @@ static int __init init_f2fs_fs(void) static void __exit exit_f2fs_fs(void) { + f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); From d29b97f7eab3d30dc54cba81a16f856afa95392d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Sep 2020 11:03:49 +0800 Subject: [PATCH 093/580] f2fs: compress: fix to disallow enabling compress on non-empty file Compressed inode and normal inode has different layout, so we should disallow enabling compress on non-empty file to avoid race condition during inode .i_addr array parsing and updating. Signed-off-by: Chao Yu [Jaegeuk Kim: Fix missing condition] Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8d9b4eb07806..1a6bdf8d3c4b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1843,6 +1843,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (iflags & F2FS_COMPR_FL) { if (!f2fs_may_compress(inode)) return -EINVAL; + if (S_ISREG(inode->i_mode) && inode->i_size) + return -EINVAL; set_compress_context(inode); } From 62cc2c613385371c06a99de6a30dd2cc251d5b5d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Sep 2020 00:54:50 -0700 Subject: [PATCH 094/580] f2fs: fix slab leak of rpages pointer This fixes the below mem leak. [ 130.157600] ============================================================================= [ 130.159662] BUG f2fs_page_array_entry-252:16 (Tainted: G W O ): Objects remaining in f2fs_page_array_entry-252:16 on __kmem_cache_shutdown() [ 130.162742] ----------------------------------------------------------------------------- [ 130.162742] [ 130.164979] Disabling lock debugging due to kernel taint [ 130.166188] INFO: Slab 0x000000009f5a52d2 objects=22 used=4 fp=0x00000000ba72c3e9 flags=0xfffffc0010200 [ 130.168269] CPU: 7 PID: 3560 Comm: umount Tainted: G B W O 5.9.0-rc4+ #35 [ 130.170019] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 [ 130.171941] Call Trace: [ 130.172528] dump_stack+0x74/0x9a [ 130.173298] slab_err+0xb7/0xdc [ 130.174044] ? kernel_poison_pages+0xc0/0xc0 [ 130.175065] ? 
on_each_cpu_cond_mask+0x48/0x90 [ 130.176096] __kmem_cache_shutdown.cold+0x34/0x141 [ 130.177190] kmem_cache_destroy+0x59/0x100 [ 130.178223] f2fs_destroy_page_array_cache+0x15/0x20 [f2fs] [ 130.179527] f2fs_put_super+0x1bc/0x380 [f2fs] [ 130.180538] generic_shutdown_super+0x72/0x110 [ 130.181547] kill_block_super+0x27/0x50 [ 130.182438] kill_f2fs_super+0x76/0xe0 [f2fs] [ 130.183448] deactivate_locked_super+0x3b/0x80 [ 130.184456] deactivate_super+0x3e/0x50 [ 130.185363] cleanup_mnt+0x109/0x160 [ 130.186179] __cleanup_mnt+0x12/0x20 [ 130.187003] task_work_run+0x70/0xb0 [ 130.187841] exit_to_user_mode_prepare+0x18f/0x1b0 [ 130.188917] syscall_exit_to_user_mode+0x31/0x170 [ 130.189989] do_syscall_64+0x45/0x90 [ 130.190828] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 130.191986] RIP: 0033:0x7faf868ea2eb [ 130.192815] Code: 7b 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 90 f3 0f 1e fa 31 f6 e9 05 00 00 00 0f 1f 44 00 00 f3 0f 1e fa b8 a6 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 75 7b 0c 00 f7 d8 64 89 01 [ 130.196872] RSP: 002b:00007fffb7edb478 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 130.198494] RAX: 0000000000000000 RBX: 00007faf86a18204 RCX: 00007faf868ea2eb [ 130.201021] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000055971df71c50 [ 130.203415] RBP: 000055971df71a40 R08: 0000000000000000 R09: 00007fffb7eda1f0 [ 130.205772] R10: 00007faf86a04339 R11: 0000000000000246 R12: 000055971df71c50 [ 130.208150] R13: 0000000000000000 R14: 000055971df71b38 R15: 0000000000000000 [ 130.210515] INFO: Object 0x00000000a980843a @offset=744 [ 130.212476] INFO: Allocated in page_array_alloc+0x3d/0xe0 [f2fs] age=1572 cpu=0 pid=3297 [ 130.215030] __slab_alloc+0x20/0x40 [ 130.216566] kmem_cache_alloc+0x2a0/0x2e0 [ 130.218217] page_array_alloc+0x3d/0xe0 [f2fs] [ 130.219940] f2fs_init_compress_ctx+0x1f/0x40 [f2fs] [ 130.221736] f2fs_write_cache_pages+0x3db/0x860 [f2fs] [ 130.223591] f2fs_write_data_pages+0x2c9/0x300 [f2fs] [ 130.225414] do_writepages+0x43/0xd0 [ 130.226907] __filemap_fdatawrite_range+0xd5/0x110 [ 130.228632] filemap_write_and_wait_range+0x48/0xb0 [ 130.230336] __generic_file_write_iter+0x18a/0x1d0 [ 130.232035] f2fs_file_write_iter+0x226/0x550 [f2fs] [ 130.233737] new_sync_write+0x113/0x1a0 [ 130.235204] vfs_write+0x1a6/0x200 [ 130.236579] ksys_write+0x67/0xe0 [ 130.237898] __x64_sys_write+0x1a/0x20 [ 130.239309] do_syscall_64+0x38/0x90 Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 +- fs/f2fs/data.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 44b39271da0d..20643ac3638a 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -158,7 +158,7 @@ struct page *f2fs_compress_control_page(struct page *page) int f2fs_init_compress_ctx(struct compress_ctx *cc) { - if (cc->nr_rpages) + if (cc->rpages) return 0; cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2498909bca05..a96107462990 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3083,6 +3083,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, retry = 0; } } + if (f2fs_compressed_file(inode)) + f2fs_destroy_compress_ctx(&cc); #endif if (retry) { index = 0; From 043b7c8b8bc9c542f17782020a0e1352e65dab33 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Oct 2020 11:14:35 -0700 Subject: [PATCH 095/580] f2fs: fix memory alignment to support 32bit In 32bit system, 64-bits key breaks memory alignment. 
This fixes the commit "f2fs: support 64-bits key in f2fs rb-tree node entry". Reported-by: Nicolas Chauvet Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c9bce3744043..e37cb1ddbf82 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -618,7 +618,7 @@ struct rb_entry { unsigned int len; /* length of the entry */ }; unsigned long long key; /* 64-bits key */ - }; + } __packed; }; struct extent_info { From b346ed25b60332b82150e454391f162f6e3947c1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Oct 2020 12:15:22 -0700 Subject: [PATCH 096/580] f2fs: reject CASEFOLD inode flag without casefold feature syzbot reported: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 PID: 6860 Comm: syz-executor835 Not tainted 5.9.0-rc8-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:utf8_casefold+0x43/0x1b0 fs/unicode/utf8-core.c:107 [...] Call Trace: f2fs_init_casefolded_name fs/f2fs/dir.c:85 [inline] __f2fs_setup_filename fs/f2fs/dir.c:118 [inline] f2fs_prepare_lookup+0x3bf/0x640 fs/f2fs/dir.c:163 f2fs_lookup+0x10d/0x920 fs/f2fs/namei.c:494 __lookup_hash+0x115/0x240 fs/namei.c:1445 filename_create+0x14b/0x630 fs/namei.c:3467 user_path_create fs/namei.c:3524 [inline] do_mkdirat+0x56/0x310 fs/namei.c:3664 do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 [...] The problem is that an inode has F2FS_CASEFOLD_FL set, but the filesystem doesn't have the casefold feature flag set, and therefore super_block::s_encoding is NULL. Fix this by making sanity_check_inode() reject inodes that have F2FS_CASEFOLD_FL when the filesystem doesn't have the casefold feature. Reported-by: syzbot+05139c4039d0679e19ff@syzkaller.appspotmail.com Fixes: 2c2eb7a300cd ("f2fs: Support case-insensitive file name lookups") Signed-off-by: Eric Biggers Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 05347c944422..2be85afa96c7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -287,6 +287,13 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", + __func__, inode->i_ino); + return false; + } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && fi->i_flags & F2FS_COMPR_FL && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, From a1ac940fd8f9d4472df968d739edc8593622e33f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 9 Oct 2020 10:40:48 +0800 Subject: [PATCH 097/580] f2fs: fix to set SBI_NEED_FSCK flag for inconsistent inode If compressed inode has inconsistent fields on i_compress_algorithm, i_compr_blocks and i_log_cluster_size, we missed to set SBI_NEED_FSCK to notice fsck to repair the inode, fix it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2be85afa96c7..3c13f88d8139 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -299,6 +299,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { if (ri->i_compress_algorithm >= COMPRESS_MAX) { + set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " "compress algorithm: %u, run fsck to fix", __func__, inode->i_ino, @@ -307,6 +308,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (le64_to_cpu(ri->i_compr_blocks) > SECTOR_TO_BLOCK(inode->i_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent " "i_compr_blocks:%llu, i_blocks:%lu, run fsck to fix", __func__, inode->i_ino, @@ -316,6 +318,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " "log cluster size: %u, run fsck to fix", __func__, inode->i_ino, From 19ff04ae40ab6dc26532cd96f59939aedc263cb7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Oct 2020 14:17:35 -0700 Subject: [PATCH 098/580] f2fs: handle errors of f2fs_get_meta_page_nofail First problem is we hit BUG_ON() in f2fs_get_sum_page given EIO on f2fs_get_meta_page_nofail(). Quick fix was not to give any error with infinite loop, but syzbot caught a case where it goes to that loop from fuzzed image. In turned out we abused f2fs_get_meta_page_nofail() like in the below call stack. - f2fs_fill_super - f2fs_build_segment_manager - build_sit_entries - get_current_sit_page INFO: task syz-executor178:6870 can't die for more than 143 seconds. task:syz-executor178 state:R stack:26960 pid: 6870 ppid: 6869 flags:0x00004006 Call Trace: Showing all locks held in the system: 1 lock held by khungtaskd/1179: #0: ffffffff8a554da0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260 kernel/locking/lockdep.c:6242 1 lock held by systemd-journal/3920: 1 lock held by in:imklog/6769: #0: ffff88809eebc130 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0xe9/0x100 fs/file.c:930 1 lock held by syz-executor178/6870: #0: ffff8880925120e0 (&type->s_umount_key#47/1){+.+.}-{3:3}, at: alloc_super+0x201/0xaf0 fs/super.c:229 Actually, we didn't have to use _nofail in this case, since we could return error to mount(2) already with the error handler. As a result, this patch tries to 1) remove _nofail callers as much as possible, 2) deal with error case in last remaining caller, f2fs_get_sum_page(). 
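For callers this changes the contract of f2fs_get_sum_page(): it may now return ERR_PTR(-EIO) once a checkpoint error is pending, so the remaining users have to cope with that instead of assuming success. The diff below handles it in change_curseg() roughly like this (quoted from the hunk, shown here only to make the new contract explicit):

    sum_page = f2fs_get_sum_page(sbi, new_segno);
    if (IS_ERR(sum_page)) {
        /* GC won't be able to use stale summary pages by cp_error */
        memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
        return;
    }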
Reported-by: syzbot+ee250ac8137be41d7b13@syzkaller.appspotmail.com Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/segment.c | 12 +++++++++--- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f18386d30f03..023462e80e58 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -107,7 +107,7 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, true); } -struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index) { struct page *page; int count = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e37cb1ddbf82..e85d85a735f3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3425,7 +3425,7 @@ unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c9c146a43931..becd2b289249 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -109,7 +109,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid)); + return f2fs_get_meta_page(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 23f0cc300484..962386709149 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2378,7 +2378,9 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) */ struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno)); + if (unlikely(f2fs_cp_error(sbi))) + return ERR_PTR(-EIO); + return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, @@ -2668,7 +2670,11 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) __next_free_blkoff(sbi, curseg, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); - f2fs_bug_on(sbi, IS_ERR(sum_page)); + if (IS_ERR(sum_page)) { + /* GC won't be able to use stale summary pages by cp_error */ + memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); + return; + } sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); @@ -3963,7 +3969,7 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, From 94de2d7aad56d5dfa8e9ee5449024576dac558df Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 
Oct 2020 10:28:14 +0800 Subject: [PATCH 099/580] f2fs: don't issue flush in f2fs_flush_device_cache() for nobarrier case This patch changes f2fs_flush_device_cache() to skip issuing flush for nobarrier case. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 962386709149..416785766b4c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -759,6 +759,9 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) if (!f2fs_is_multi_device(sbi)) return 0; + if (test_opt(sbi, NOBARRIER)) + return 0; + for (i = 1; i < sbi->s_ndevs; i++) { if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; From 17c3e9a89d24748a13dba88559aa16fc3c64514b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2020 17:06:05 +0800 Subject: [PATCH 100/580] f2fs: introduce check_swap_activate_fast() check_swap_activate() will lookup block mapping via bmap() one by one, so its performance is very bad, this patch introduces check_swap_activate_fast() to use f2fs_fiemap() to boost this process, since f2fs_fiemap() will lookup block mappings in batch, therefore, it can improve swapon()'s performance significantly. Note that this enhancement only works when page size is equal to f2fs' block size. Testcase: (backend device: zram) - touch file - pin & fallocate file to 8GB - mkswap file - swapon file Before: real 0m2.999s user 0m0.000s sys 0m2.980s After: real 0m0.081s user 0m0.000s sys 0m0.064s Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a96107462990..6440c297012f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3837,6 +3837,83 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP +static int check_swap_activate_fast(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) +{ + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + sector_t cur_lblock; + sector_t last_lblock; + sector_t pblock; + sector_t lowest_pblock = -1; + sector_t highest_pblock = 0; + int nr_extents = 0; + unsigned long nr_pblocks; + unsigned long len; + int ret; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. 
+ */ + cur_lblock = 0; + last_lblock = logical_to_blk(inode, i_size_read(inode)); + len = i_size_read(inode); + + while (cur_lblock <= last_lblock && cur_lblock < sis->max) { + struct buffer_head map_bh; + pgoff_t next_pgofs; + + cond_resched(); + + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len - cur_lblock; + + ret = get_data_block(inode, cur_lblock, &map_bh, 0, + F2FS_GET_BLOCK_FIEMAP, &next_pgofs); + if (ret) + goto err_out; + + /* hole */ + if (!buffer_mapped(&map_bh)) + goto err_out; + + pblock = map_bh.b_blocknr; + nr_pblocks = logical_to_blk(inode, map_bh.b_size); + + if (cur_lblock + nr_pblocks >= sis->max) + nr_pblocks = sis->max - cur_lblock; + + if (cur_lblock) { /* exclude the header page */ + if (pblock < lowest_pblock) + lowest_pblock = pblock; + if (pblock + nr_pblocks - 1 > highest_pblock) + highest_pblock = pblock + nr_pblocks - 1; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); + if (ret < 0) + goto out; + nr_extents += ret; + cur_lblock += nr_pblocks; + } + ret = nr_extents; + *span = 1 + highest_pblock - lowest_pblock; + if (cur_lblock == 0) + cur_lblock = 1; /* force Empty message */ + sis->max = cur_lblock; + sis->pages = cur_lblock - 1; + sis->highest_bit = cur_lblock - 1; +out: + return ret; +err_out: + pr_err("swapon: swapfile has holes\n"); + return -EINVAL; +} + /* Copied from generic_swapfile_activate() to check any holes */ static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) @@ -3853,6 +3930,9 @@ static int check_swap_activate(struct swap_info_struct *sis, int nr_extents = 0; int ret; + if (PAGE_SIZE == F2FS_BLKSIZE) + return check_swap_activate_fast(sis, swap_file, span); + blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; From 462582450a8f8deabce1c48177c5425104c43eb5 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 12 Oct 2020 13:59:47 +0900 Subject: [PATCH 101/580] f2fs: fix writecount false positive in releasing compress blocks In current condition check, if it detects writecount, it return -EBUSY regardless of f_mode of the file. Fixed it. 
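A concrete instance of the false positive (illustrative, following the description above): a task opens the file O_RDWR and is its only opener, so i_writecount == 1 at this point. The first clause (FMODE_WRITE && writecount != 1) is false, but the bare trailing writecount test still fires and the ioctl fails with -EBUSY even though the caller itself is the only writer. With the fix, that bare writecount test applies only when the ioctl caller did not open the file for write.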
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1a6bdf8d3c4b..a1da4831228d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3533,7 +3533,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) inode_lock(inode); writecount = atomic_read(&inode->i_writecount); - if ((filp->f_mode & FMODE_WRITE && writecount != 1) || writecount) { + if ((filp->f_mode & FMODE_WRITE && writecount != 1) || + (!(filp->f_mode & FMODE_WRITE) && writecount)) { ret = -EBUSY; goto out; } From b1b5de6d73f1b7a6e36b1bb74c8f0bdebecb9d03 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Mon, 12 Oct 2020 14:09:48 +0100 Subject: [PATCH 102/580] f2fs: wait for sysfs kobject removal before freeing f2fs_sb_info syzkaller found that with CONFIG_DEBUG_KOBJECT_RELEASE=y, unmounting an f2fs filesystem could result in the following splat: kobject: 'loop5' ((____ptrval____)): kobject_release, parent 0000000000000000 (delayed 250) kobject: 'f2fs_xattr_entry-7:5' ((____ptrval____)): kobject_release, parent 0000000000000000 (delayed 750) ------------[ cut here ]------------ ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x98 WARNING: CPU: 0 PID: 699 at lib/debugobjects.c:485 debug_print_object+0x180/0x240 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 699 Comm: syz-executor.5 Tainted: G S 5.9.0-rc8+ #101 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x4d8 show_stack+0x34/0x48 dump_stack+0x174/0x1f8 panic+0x360/0x7a0 __warn+0x244/0x2ec report_bug+0x240/0x398 bug_handler+0x50/0xc0 call_break_hook+0x160/0x1d8 brk_handler+0x30/0xc0 do_debug_exception+0x184/0x340 el1_dbg+0x48/0xb0 el1_sync_handler+0x170/0x1c8 el1_sync+0x80/0x100 debug_print_object+0x180/0x240 debug_check_no_obj_freed+0x200/0x430 slab_free_freelist_hook+0x190/0x210 kfree+0x13c/0x460 f2fs_put_super+0x624/0xa58 generic_shutdown_super+0x120/0x300 kill_block_super+0x94/0xf8 kill_f2fs_super+0x244/0x308 deactivate_locked_super+0x104/0x150 deactivate_super+0x118/0x148 cleanup_mnt+0x27c/0x3c0 __cleanup_mnt+0x28/0x38 task_work_run+0x10c/0x248 do_notify_resume+0x9d4/0x1188 work_pending+0x8/0x34c Like the error handling for f2fs_register_sysfs(), we need to wait for the kobject to be destroyed before returning to prevent a potential use-after-free. Fixes: bf9e697ecd42 ("f2fs: expose features to sysfs entry") Cc: Jaegeuk Kim Cc: Chao Yu Signed-off-by: Jamie Iles Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 93b2a91a7b37..64f7860ba3e4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -973,4 +973,5 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) } kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); } From d2af1846e23f4b2ab18cc155327b86111ce07824 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 13 Oct 2020 06:49:21 +0800 Subject: [PATCH 103/580] f2fs: code cleanup by removing unnecessary check f2fs_seek_block() is only used for regular file, so don't have to check inline dentry in it. 
Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a1da4831228d..c8b6ab3a6c45 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -429,9 +429,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; /* handle inline data case */ - if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { - if (whence == SEEK_HOLE) - data_ofs = isize; + if (f2fs_has_inline_data(inode) && whence == SEEK_HOLE) { + data_ofs = isize; goto found; } From 0f511eb4105ed030b9a0a00fb96f9f26b92720bf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Sep 2020 22:47:21 -0700 Subject: [PATCH 104/580] fscrypt: export fscrypt_d_revalidate() Dentries that represent no-key names must have a dentry_operations that includes fscrypt_d_revalidate(). Currently, this is handled by fscrypt_prepare_lookup() installing fscrypt_d_ops. However, ceph support for encryption (https://lore.kernel.org/r/20200914191707.380444-1-jlayton@kernel.org) can't use fscrypt_d_ops, since ceph already has its own dentry_operations. Similarly, ext4 and f2fs support for directories that are both encrypted and casefolded (https://lore.kernel.org/r/20200923010151.69506-1-drosen@google.com) can't use fscrypt_d_ops either, since casefolding requires some dentry operations too. To satisfy both users, we need to move the responsibility of installing the dentry_operations to filesystems. In preparation for this, export fscrypt_d_revalidate() and give it a !CONFIG_FS_ENCRYPTION stub. Reviewed-by: Jeff Layton Link: https://lore.kernel.org/r/20200924054721.187797-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/fname.c | 3 ++- include/linux/fscrypt.h | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 5e0c24415583..f4adbcc25678 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. 
*/ -static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; int err; @@ -607,6 +607,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return valid; } +EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); const struct dentry_operations fscrypt_d_ops = { .d_revalidate = fscrypt_d_revalidate, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 48c2869fea26..a2e145bd889e 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -199,6 +199,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); +int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); /* bio.c */ void fscrypt_decrypt_bio(struct bio *bio); @@ -451,6 +452,12 @@ static inline u64 fscrypt_fname_siphash(const struct inode *dir, return 0; } +static inline int fscrypt_d_revalidate(struct dentry *dentry, + unsigned int flags) +{ + return 1; +} + /* bio.c */ static inline void fscrypt_decrypt_bio(struct bio *bio) { From c61bae0801e82a9504a8301ade502b0abda9c2f2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Sep 2020 21:26:23 -0700 Subject: [PATCH 105/580] fscrypt: don't call no-key names "ciphertext names" Currently we're using the term "ciphertext name" ambiguously because it can mean either the actual ciphertext filename, or the encoded filename that is shown when an encrypted directory is listed without its key. The latter we're now usually calling the "no-key name"; and while it's derived from the ciphertext name, it's not the same thing. To avoid this ambiguity, rename fscrypt_name::is_ciphertext_name to fscrypt_name::is_nokey_name, and update comments that say "ciphertext name" (or "encrypted name") to say "no-key name" instead when warranted. Link: https://lore.kernel.org/r/20200924042624.98439-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/fname.c | 16 ++++++++-------- fs/crypto/hooks.c | 6 +++--- fs/f2fs/dir.c | 2 +- include/linux/fscrypt.h | 15 +++++++-------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f4adbcc25678..eb429b5afb6a 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -420,9 +420,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); * directory's encryption key, then @iname is the plaintext, so we encrypt it to * get the disk_name. * - * Else, for keyless @lookup operations, @iname is the presented ciphertext, so - * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be - * impossible in this case, so we fail them with ENOKEY. + * Else, for keyless @lookup operations, @iname should be a no-key name, so we + * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will + * be impossible in this case, so we fail them with ENOKEY. * * If successful, fscrypt_free_filename() must be called later to clean up. 
* @@ -466,7 +466,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, } if (!lookup) return -ENOKEY; - fname->is_ciphertext_name = true; + fname->is_nokey_name = true; /* * We don't have the key and we are doing a lookup; decode the @@ -576,17 +576,17 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) /* * Plaintext names are always valid, since fscrypt doesn't support - * reverting to ciphertext names without evicting the directory's inode + * reverting to no-key names without evicting the directory's inode * -- which implies eviction of the dentries in the directory. */ if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) return 1; /* - * Ciphertext name; valid if the directory's key is still unavailable. + * No-key name; valid if the directory's key is still unavailable. * - * Although fscrypt forbids rename() on ciphertext names, we still must - * use dget_parent() here rather than use ->d_parent directly. That's + * Although fscrypt forbids rename() on no-key names, we still must use + * dget_parent() here rather than use ->d_parent directly. That's * because a corrupted fs image may contain directory hard links, which * the VFS handles by moving the directory's dentry tree in the dcache * each time ->lookup() finds the directory and it already has a dentry diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 0b5077e4759a..880d6f6e75bd 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -59,7 +59,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, if (err) return err; - /* ... in case we looked up ciphertext name before key was added */ + /* ... in case we looked up no-key name before key was added */ if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) return -ENOKEY; @@ -84,7 +84,7 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) return err; - /* ... in case we looked up ciphertext name(s) before key was added */ + /* ... in case we looked up no-key name(s) before key was added */ if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_ENCRYPTED_NAME) return -ENOKEY; @@ -113,7 +113,7 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, if (err && err != -ENOENT) return err; - if (fname->is_ciphertext_name) { + if (fname->is_nokey_name) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_ENCRYPTED_NAME; spin_unlock(&dentry->d_lock); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7808cf512041..80f23315b1a1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -112,7 +112,7 @@ static int __f2fs_setup_filename(const struct inode *dir, #ifdef CONFIG_FS_ENCRYPTION fname->crypto_buf = crypt_name->crypto_buf; #endif - if (crypt_name->is_ciphertext_name) { + if (crypt_name->is_nokey_name) { /* hash was decoded from the no-key name */ fname->hash = cpu_to_le32(crypt_name->hash); } else { diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index a2e145bd889e..5a095cb18c02 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -36,7 +36,7 @@ struct fscrypt_name { u32 hash; u32 minor_hash; struct fscrypt_str crypto_buf; - bool is_ciphertext_name; + bool is_nokey_name; }; #define FSTR_INIT(n, l) { .name = n, .len = l } @@ -629,17 +629,16 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, * @fname: (output) the name to use to search the on-disk directory * * Prepare for ->lookup() in a directory which may be encrypted by determining - * the name that will actually be used to search the directory on-disk. 
Lookups - * can be done with or without the directory's encryption key; without the key, - * filenames are presented in encrypted form. Therefore, we'll try to set up - * the directory's encryption key, but even without it the lookup can continue. + * the name that will actually be used to search the directory on-disk. If the + * directory's encryption key is available, then the lookup is assumed to be by + * plaintext name; otherwise, it is assumed to be by no-key name. * * This also installs a custom ->d_revalidate() method which will invalidate the * dentry if it was created without the key and the key is later added. * - * Return: 0 on success; -ENOENT if key is unavailable but the filename isn't a - * correctly formed encoded ciphertext name, so a negative dentry should be - * created; or another -errno code. + * Return: 0 on success; -ENOENT if the directory's key is unavailable but the + * filename isn't a valid no-key name, so a negative dentry should be created; + * or another -errno code. */ static inline int fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, From eb2969d86340d3992516d937586fd61349b4eefb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 23 Sep 2020 21:26:24 -0700 Subject: [PATCH 106/580] fscrypt: rename DCACHE_ENCRYPTED_NAME to DCACHE_NOKEY_NAME Originally we used the term "encrypted name" or "ciphertext name" to mean the encoded filename that is shown when an encrypted directory is listed without its key. But these terms are ambiguous since they also mean the filename stored on-disk. "Encrypted name" is especially ambiguous since it could also be understood to mean "this filename is encrypted on-disk", similar to "encrypted file". So we've started calling these encoded names "no-key names" instead. Therefore, rename DCACHE_ENCRYPTED_NAME to DCACHE_NOKEY_NAME to avoid confusion about what this flag means. Link: https://lore.kernel.org/r/20200924042624.98439-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/fname.c | 2 +- fs/crypto/hooks.c | 7 +++---- include/linux/dcache.h | 2 +- include/linux/fscrypt.h | 12 ++++++------ 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index eb429b5afb6a..e50d670350f2 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -579,7 +579,7 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) * reverting to no-key names without evicting the directory's inode * -- which implies eviction of the dentries in the directory. */ - if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) + if (!(dentry->d_flags & DCACHE_NOKEY_NAME)) return 1; /* diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 880d6f6e75bd..7099065154cb 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -60,7 +60,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, return err; /* ... in case we looked up no-key name before key was added */ - if (dentry->d_flags & DCACHE_ENCRYPTED_NAME) + if (dentry->d_flags & DCACHE_NOKEY_NAME) return -ENOKEY; if (!fscrypt_has_permitted_context(dir, inode)) @@ -85,8 +85,7 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, return err; /* ... 
in case we looked up no-key name(s) before key was added */ - if ((old_dentry->d_flags | new_dentry->d_flags) & - DCACHE_ENCRYPTED_NAME) + if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME) return -ENOKEY; if (old_dir != new_dir) { @@ -115,7 +114,7 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, if (fname->is_nokey_name) { spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_ENCRYPTED_NAME; + dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); d_set_d_op(dentry, &fscrypt_d_ops); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4c6b92d5ac26..41c90460c618 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -211,7 +211,7 @@ struct dentry_operations { #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ -#define DCACHE_ENCRYPTED_NAME 0x02000000 /* Encrypted name (dir key was unavailable) */ +#define DCACHE_NOKEY_NAME 0x02000000 /* Encrypted name encoded without key */ #define DCACHE_OP_REAL 0x04000000 #define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */ diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 5a095cb18c02..0362699f7c86 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -102,15 +102,15 @@ fscrypt_get_dummy_context(struct super_block *sb) } /* - * When d_splice_alias() moves a directory's encrypted alias to its decrypted - * alias as a result of the encryption key being added, DCACHE_ENCRYPTED_NAME - * must be cleared. Note that we don't have to support arbitrary moves of this - * flag because fscrypt doesn't allow encrypted aliases to be the source or - * target of a rename(). + * When d_splice_alias() moves a directory's no-key alias to its plaintext alias + * as a result of the encryption key being added, DCACHE_NOKEY_NAME must be + * cleared. Note that we don't have to support arbitrary moves of this flag + * because fscrypt doesn't allow no-key names to be the source or target of a + * rename(). */ static inline void fscrypt_handle_d_move(struct dentry *dentry) { - dentry->d_flags &= ~DCACHE_ENCRYPTED_NAME; + dentry->d_flags &= ~DCACHE_NOKEY_NAME; } /* crypto.c */ From eb12632312d6e3cbcd96fc71ed4a65762c192ced Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 25 Oct 2020 07:35:47 -0700 Subject: [PATCH 107/580] f2fs: call f2fs_get_meta_page_retry for nat page When running fault injection test, if we don't stop checkpoint, some stale NAT entries were flushed which breaks consistency. 
Fixes: 86f33603f8c5 ("f2fs: handle errors of f2fs_get_meta_page_nofail") Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index becd2b289249..cfab2447059c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -109,7 +109,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - return f2fs_get_meta_page(sbi, current_nat_addr(sbi, nid)); + return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) From 7c71b5495d026ba7719e5f127711c30ca2af4047 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 2 Nov 2020 17:36:58 +0800 Subject: [PATCH 108/580] f2fs: fix to seek incorrect data offset in inline data file As kitestramuort reported: F2FS-fs (nvme0n1p4): access invalid blkaddr:1598541474 [ 25.725898] ------------[ cut here ]------------ [ 25.725903] WARNING: CPU: 6 PID: 2018 at f2fs_is_valid_blkaddr+0x23a/0x250 [ 25.725923] Call Trace: [ 25.725927] ? f2fs_llseek+0x204/0x620 [ 25.725929] ? ovl_copy_up_data+0x14f/0x200 [ 25.725931] ? ovl_copy_up_inode+0x174/0x1e0 [ 25.725933] ? ovl_copy_up_one+0xa22/0xdf0 [ 25.725936] ? ovl_copy_up_flags+0xa6/0xf0 [ 25.725938] ? ovl_aio_cleanup_handler+0xd0/0xd0 [ 25.725939] ? ovl_maybe_copy_up+0x86/0xa0 [ 25.725941] ? ovl_open+0x22/0x80 [ 25.725943] ? do_dentry_open+0x136/0x350 [ 25.725945] ? path_openat+0xb7e/0xf40 [ 25.725947] ? __check_sticky+0x40/0x40 [ 25.725948] ? do_filp_open+0x70/0x100 [ 25.725950] ? __check_sticky+0x40/0x40 [ 25.725951] ? __check_sticky+0x40/0x40 [ 25.725953] ? __x64_sys_openat+0x1db/0x2c0 [ 25.725955] ? do_syscall_64+0x2d/0x40 [ 25.725957] ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 llseek() reports invalid block address access, the root cause is if file has inline data, f2fs_seek_block() will access inline data regard as block address index in inode block, which should be wrong, fix it. Reported-by: kitestramuort Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c8b6ab3a6c45..b06d68fcd293 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -429,9 +429,14 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; /* handle inline data case */ - if (f2fs_has_inline_data(inode) && whence == SEEK_HOLE) { - data_ofs = isize; - goto found; + if (f2fs_has_inline_data(inode)) { + if (whence == SEEK_HOLE) { + data_ofs = isize; + goto found; + } else if (whence == SEEK_DATA) { + data_ofs = offset; + goto found; + } } pgofs = (pgoff_t)(offset >> PAGE_SHIFT); From a67a570930f9aeb6bf75f06ac4495050ca97659f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 2 Nov 2020 14:21:31 +0800 Subject: [PATCH 109/580] f2fs: move ioctl interface definitions to separated file Like other filesystem does, we introduce a new file f2fs.h in path of include/uapi/linux/, and move f2fs-specified ioctl interface definitions to that file, after then, in order to use those definitions, userspace developer only need to include the new header file rather than copy & paste definitions from fs/f2fs/f2fs.h. 
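With the header exported, a userspace program can issue f2fs ioctls against the installed uapi header alone; a minimal, hypothetical example (the opened path and error handling are illustrative):

  #include <linux/f2fs.h>		/* the new uapi header */
  #include <linux/types.h>
  #include <sys/ioctl.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(int argc, char **argv)
  {
  	__u32 features;
  	int fd = open(argv[1], O_RDONLY);

  	if (fd < 0)
  		return 1;
  	if (ioctl(fd, F2FS_IOC_GET_FEATURES, &features) == 0)
  		printf("f2fs feature bits: 0x%x\n", features);
  	close(fd);
  	return 0;
  }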
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- MAINTAINERS | 1 + fs/f2fs/f2fs.h | 79 --------------------------------- fs/f2fs/file.c | 1 + include/trace/events/f2fs.h | 1 + include/uapi/linux/f2fs.h | 87 +++++++++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 79 deletions(-) create mode 100644 include/uapi/linux/f2fs.h diff --git a/MAINTAINERS b/MAINTAINERS index bd16551dfebf..1f011aac6d2b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5595,6 +5595,7 @@ F: Documentation/ABI/testing/sysfs-fs-f2fs F: fs/f2fs/ F: include/linux/f2fs_fs.h F: include/trace/events/f2fs.h +F: include/uapi/linux/f2fs.h F71805F HARDWARE MONITORING DRIVER M: Jean Delvare diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e85d85a735f3..14ba62a55bca 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -401,85 +401,6 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, return size <= MAX_SIT_JENTRIES(journal); } -/* - * f2fs-specific ioctl commands - */ -#define F2FS_IOCTL_MAGIC 0xf5 -#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) -#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) -#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) -#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) -#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) -#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ - struct f2fs_defragment) -#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ - struct f2fs_move_range) -#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ - struct f2fs_flush_device) -#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ - struct f2fs_gc_range) -#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) -#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) -#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) -#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) -#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64) -#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64) -#define F2FS_IOC_RELEASE_COMPRESS_BLOCKS \ - _IOR(F2FS_IOCTL_MAGIC, 18, __u64) -#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \ - _IOR(F2FS_IOCTL_MAGIC, 19, __u64) -#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, \ - struct f2fs_sectrim_range) - -/* - * should be same as XFS_IOC_GOINGDOWN. 
- * Flags for going down operation used by FS_IOC_GOINGDOWN - */ -#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */ -#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ -#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ -#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ -#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ -#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */ - -/* - * Flags used by F2FS_IOC_SEC_TRIM_FILE - */ -#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */ -#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */ -#define F2FS_TRIM_FILE_MASK 0x3 - -struct f2fs_gc_range { - u32 sync; - u64 start; - u64 len; -}; - -struct f2fs_defragment { - u64 start; - u64 len; -}; - -struct f2fs_move_range { - u32 dst_fd; /* destination fd */ - u64 pos_in; /* start position in src_fd */ - u64 pos_out; /* start position in dst_fd */ - u64 len; /* size to move */ -}; - -struct f2fs_flush_device { - u32 dev_num; /* device number to flush */ - u32 segments; /* # of segments to flush */ -}; - -struct f2fs_sectrim_range { - u64 start; - u64 len; - u64 flags; -}; - /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 static inline int get_extra_isize(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b06d68fcd293..aa8a19b249db 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -31,6 +31,7 @@ #include "gc.h" #include "trace.h" #include +#include static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index ca01996fca66..4d72bf206fd4 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -6,6 +6,7 @@ #define _TRACE_F2FS_H #include +#include #define show_dev(dev) MAJOR(dev), MINOR(dev) #define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h new file mode 100644 index 000000000000..28bcfe8d2c27 --- /dev/null +++ b/include/uapi/linux/f2fs.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_F2FS_H +#define _UAPI_LINUX_F2FS_H +#include +#include + +/* + * f2fs-specific ioctl commands + */ +#define F2FS_IOCTL_MAGIC 0xf5 +#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) +#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) +#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) +#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) +#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) +#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) +#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) +#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) +#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) +#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) +#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64) +#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64) +#define F2FS_IOC_RELEASE_COMPRESS_BLOCKS \ + 
_IOR(F2FS_IOCTL_MAGIC, 18, __u64) +#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \ + _IOR(F2FS_IOCTL_MAGIC, 19, __u64) +#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, \ + struct f2fs_sectrim_range) + +/* + * should be same as XFS_IOC_GOINGDOWN. + * Flags for going down operation used by FS_IOC_GOINGDOWN + */ +#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */ +#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ +#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ +#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ +#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ +#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */ + +/* + * Flags used by F2FS_IOC_SEC_TRIM_FILE + */ +#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */ +#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */ +#define F2FS_TRIM_FILE_MASK 0x3 + +struct f2fs_gc_range { + __u32 sync; + __u64 start; + __u64 len; +}; + +struct f2fs_defragment { + __u64 start; + __u64 len; +}; + +struct f2fs_move_range { + __u32 dst_fd; /* destination fd */ + __u64 pos_in; /* start position in src_fd */ + __u64 pos_out; /* start position in dst_fd */ + __u64 len; /* size to move */ +}; + +struct f2fs_flush_device { + __u32 dev_num; /* device number to flush */ + __u32 segments; /* # of segments to flush */ +}; + +struct f2fs_sectrim_range { + __u64 start; + __u64 len; + __u64 flags; +}; + +#endif /* _UAPI_LINUX_F2FS_H */ From a3cc036c668e757b3b3a2516385a42aa309091e7 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 30 Oct 2020 13:10:34 +0900 Subject: [PATCH 110/580] f2fs: add F2FS_IOC_GET_COMPRESS_OPTION ioctl Added a new F2FS_IOC_GET_COMPRESS_OPTION ioctl to get file compression option of a file. struct f2fs_comp_option { u8 algorithm; => compression algorithm => 0:lzo, 1:lz4, 2:zstd, 3:lzorle u8 log_cluster_size; => log scale cluster size => 2 ~ 8 }; struct f2fs_comp_option option; ioctl(fd, F2FS_IOC_GET_COMPRESS_OPTION, &option); Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 30 ++++++++++++++++++++++++++++++ include/uapi/linux/f2fs.h | 7 +++++++ 2 files changed, 37 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index aa8a19b249db..90b502bf2a39 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3967,6 +3967,33 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) return ret; } +static int f2fs_ioc_get_compress_option(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_comp_option option; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + inode_lock_shared(inode); + + if (!f2fs_compressed_file(inode)) { + inode_unlock_shared(inode); + return -ENODATA; + } + + option.algorithm = F2FS_I(inode)->i_compress_algorithm; + option.log_cluster_size = F2FS_I(inode)->i_log_cluster_size; + + inode_unlock_shared(inode); + + if (copy_to_user((struct f2fs_comp_option __user *)arg, &option, + sizeof(option))) + return -EFAULT; + + return 0; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -4055,6 +4082,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_reserve_compress_blocks(filp, arg); case F2FS_IOC_SEC_TRIM_FILE: return f2fs_sec_trim_file(filp, arg); + case F2FS_IOC_GET_COMPRESS_OPTION: + return f2fs_ioc_get_compress_option(filp, arg); default: return -ENOTTY; } @@ -4224,6 +4253,7 @@ long 
f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: case F2FS_IOC_SEC_TRIM_FILE: + case F2FS_IOC_GET_COMPRESS_OPTION: break; default: return -ENOIOCTLCMD; diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 28bcfe8d2c27..872e61d78f29 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -36,6 +36,8 @@ _IOR(F2FS_IOCTL_MAGIC, 19, __u64) #define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, \ struct f2fs_sectrim_range) +#define F2FS_IOC_GET_COMPRESS_OPTION _IOR(F2FS_IOCTL_MAGIC, 21, \ + struct f2fs_comp_option) /* * should be same as XFS_IOC_GOINGDOWN. @@ -84,4 +86,9 @@ struct f2fs_sectrim_range { __u64 flags; }; +struct f2fs_comp_option { + __u8 algorithm; + __u8 log_cluster_size; +}; + #endif /* _UAPI_LINUX_F2FS_H */ From d29f4a5b03a430b99d88bfb71700916f18124ce4 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 30 Oct 2020 13:10:35 +0900 Subject: [PATCH 111/580] f2fs: add F2FS_IOC_SET_COMPRESS_OPTION ioctl Added a new F2FS_IOC_SET_COMPRESS_OPTION ioctl to change file compression option of a file. struct f2fs_comp_option { u8 algorithm; => compression algorithm => 0:lzo, 1:lz4, 2:zstd, 3:lzorle u8 log_cluster_size; => log scale cluster size => 2 ~ 8 }; struct f2fs_comp_option option; option.algorithm = 1; option.log_cluster_size = 7; ioctl(fd, F2FS_IOC_SET_COMPRESS_OPTION, &option); Signed-off-by: Daeho Jeong [Chao Yu: remove f2fs_is_compress_algorithm_valid()] Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 54 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/f2fs.h | 2 ++ 2 files changed, 56 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 90b502bf2a39..a837544bf51a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3994,6 +3994,57 @@ static int f2fs_ioc_get_compress_option(struct file *filp, unsigned long arg) return 0; } +static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_comp_option option; + int ret = 0; + + if (!f2fs_sb_has_compression(sbi)) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&option, (struct f2fs_comp_option __user *)arg, + sizeof(option))) + return -EFAULT; + + if (!f2fs_compressed_file(inode) || + option.log_cluster_size < MIN_COMPRESS_LOG_SIZE || + option.log_cluster_size > MAX_COMPRESS_LOG_SIZE || + option.algorithm >= COMPRESS_MAX) + return -EINVAL; + + file_start_write(filp); + inode_lock(inode); + + if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) { + ret = -EBUSY; + goto out; + } + + if (inode->i_size != 0) { + ret = -EFBIG; + goto out; + } + + F2FS_I(inode)->i_compress_algorithm = option.algorithm; + F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; + F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size; + f2fs_mark_inode_dirty_sync(inode, true); + + if (!f2fs_is_compress_backend_ready(inode)) + f2fs_warn(sbi, "compression algorithm is successfully set, " + "but current kernel doesn't support this algorithm."); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -4084,6 +4135,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_sec_trim_file(filp, arg); case F2FS_IOC_GET_COMPRESS_OPTION: 
return f2fs_ioc_get_compress_option(filp, arg); + case F2FS_IOC_SET_COMPRESS_OPTION: + return f2fs_ioc_set_compress_option(filp, arg); default: return -ENOTTY; } @@ -4254,6 +4307,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: case F2FS_IOC_SEC_TRIM_FILE: case F2FS_IOC_GET_COMPRESS_OPTION: + case F2FS_IOC_SET_COMPRESS_OPTION: break; default: return -ENOIOCTLCMD; diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 872e61d78f29..f00199a2e38b 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -38,6 +38,8 @@ struct f2fs_sectrim_range) #define F2FS_IOC_GET_COMPRESS_OPTION _IOR(F2FS_IOCTL_MAGIC, 21, \ struct f2fs_comp_option) +#define F2FS_IOC_SET_COMPRESS_OPTION _IOW(F2FS_IOCTL_MAGIC, 22, \ + struct f2fs_comp_option) /* * should be same as XFS_IOC_GOINGDOWN. From d3dfd47d65e36dfeccc5f59b05ea6dd95d87ebdd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 10 Nov 2020 09:24:36 +0800 Subject: [PATCH 112/580] f2fs: avoid unneeded data copy in f2fs_ioc_move_range() Fields in struct f2fs_move_range won't change in f2fs_ioc_move_range(), let's avoid copying this structure's data to userspace. Signed-off-by: Chao Yu Reviewed-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a837544bf51a..54b22e4111f2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2920,12 +2920,6 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); - if (err) - goto err_out; - - if (copy_to_user((struct f2fs_move_range __user *)arg, - &range, sizeof(range))) - err = -EFAULT; err_out: fdput(dst); return err; From 61546ab5bab62ff06ad2f960bb57a60ecce0a315 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 10 Nov 2020 09:24:37 +0800 Subject: [PATCH 113/580] f2fs: fix compat F2FS_IOC_{MOVE,GARBAGE_COLLECT}_RANGE Eric reported a ioctl bug in below link: https://lore.kernel.org/linux-f2fs-devel/20201103032234.GB2875@sol.localdomain/ That said, on some 32-bit architectures, u64 has only 32-bit alignment, notably i386 and x86_32, so that size of struct f2fs_gc_range compiled in x86_32 is 20 bytes, however the size in x86_64 is 24 bytes, binary compiled in x86_32 can not call F2FS_IOC_GARBAGE_COLLECT_RANGE successfully due to mismatched value of ioctl command in between binary and f2fs module, similarly, F2FS_IOC_MOVE_RANGE will fail too. 
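The mismatch is easy to reproduce in userspace (an illustrative sketch mirroring struct f2fs_gc_range, not part of the patch): building it with -m64 prints 24 while -m32 prints 20, and since _IOW()/_IOWR() fold sizeof() into the command number, the two builds end up with different ioctl values.

  #include <stdio.h>
  #include <stdint.h>

  /* Same layout as struct f2fs_gc_range: on x86_64 the u64 members need
   * 8-byte alignment, so 4 bytes of padding follow "sync"; on i386 they
   * only need 4-byte alignment and the padding disappears. */
  struct gc_range_like {
  	uint32_t sync;
  	uint64_t start;
  	uint64_t len;
  };

  int main(void)
  {
  	printf("sizeof = %zu\n", sizeof(struct gc_range_like));
  	return 0;
  }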
In this patch we introduce two ioctls for compatibility of above special 32-bit binary: - F2FS_IOC32_GARBAGE_COLLECT_RANGE - F2FS_IOC32_MOVE_RANGE Reported-by: Eric Biggers Signed-off-by: Chao Yu Reviewed-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 137 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 104 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 54b22e4111f2..d50b15488313 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2502,26 +2502,19 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; } -static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { - struct inode *inode = file_inode(filp); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_gc_range range; + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); u64 end; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - - if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, - sizeof(range))) - return -EFAULT; - if (f2fs_readonly(sbi->sb)) return -EROFS; - end = range.start + range.len; - if (end < range.start || range.start < MAIN_BLKADDR(sbi) || + end = range->start + range->len; + if (end < range->start || range->start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) return -EINVAL; @@ -2530,7 +2523,7 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return ret; do_more: - if (!range.sync) { + if (!range->sync) { if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; @@ -2539,20 +2532,30 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + ret = f2fs_gc(sbi, range->sync, true, GET_SEGNO(sbi, range->start)); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; goto out; } - range.start += BLKS_PER_SEC(sbi); - if (range.start <= end) + range->start += BLKS_PER_SEC(sbi); + if (range->start <= end) goto do_more; out: mnt_drop_write_file(filp); return ret; } +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct f2fs_gc_range range; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_gc_range(filp, &range); +} + static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2889,9 +2892,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, return ret; } -static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +static int __f2fs_ioc_move_range(struct file *filp, + struct f2fs_move_range *range) { - struct f2fs_move_range range; struct fd dst; int err; @@ -2899,11 +2902,7 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) !(filp->f_mode & FMODE_WRITE)) return -EBADF; - if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, - sizeof(range))) - return -EFAULT; - - dst = fdget(range.dst_fd); + dst = fdget(range->dst_fd); if (!dst.file) return -EBADF; @@ -2916,8 +2915,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) if (err) goto err_out; - err = f2fs_move_file_range(filp, range.pos_in, dst.file, - range.pos_out, range.len); + err = f2fs_move_file_range(filp, range->pos_in, dst.file, + range->pos_out, range->len); mnt_drop_write_file(filp); err_out: @@ -2925,6 +2924,16 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) return err; } 
+static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_move_range(filp, &range); +} + static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4039,13 +4048,8 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) return ret; } -long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) - return -EIO; - if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp)))) - return -ENOSPC; - switch (cmd) { case FS_IOC_GETFLAGS: return f2fs_ioc_getflags(filp, arg); @@ -4136,6 +4140,16 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp)))) + return -ENOSPC; + + return __f2fs_ioctl(filp, cmd, arg); +} + static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -4252,8 +4266,63 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } #ifdef CONFIG_COMPAT +struct compat_f2fs_gc_range { + u32 sync; + compat_u64 start; + compat_u64 len; +}; +#define F2FS_IOC32_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11,\ + struct compat_f2fs_gc_range) + +static int f2fs_compat_ioc_gc_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_gc_range __user *urange; + struct f2fs_gc_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.sync, &urange->sync); + err |= get_user(range.start, &urange->start); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_gc_range(file, &range); +} + +struct compat_f2fs_move_range { + u32 dst_fd; + compat_u64 pos_in; + compat_u64 pos_out; + compat_u64 len; +}; +#define F2FS_IOC32_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct compat_f2fs_move_range) + +static int f2fs_compat_ioc_move_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_move_range __user *urange; + struct f2fs_move_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.dst_fd, &urange->dst_fd); + err |= get_user(range.pos_in, &urange->pos_in); + err |= get_user(range.pos_out, &urange->pos_out); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_move_range(file, &range); +} + long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(file)))) + return -ENOSPC; + switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; @@ -4264,6 +4333,10 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; + case F2FS_IOC32_GARBAGE_COLLECT_RANGE: + return f2fs_compat_ioc_gc_range(file, arg); + case F2FS_IOC32_MOVE_RANGE: + return f2fs_compat_ioc_move_range(file, arg); case F2FS_IOC_START_ATOMIC_WRITE: case F2FS_IOC_COMMIT_ATOMIC_WRITE: case F2FS_IOC_START_VOLATILE_WRITE: @@ -4280,10 +4353,8 @@ long 
f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case FS_IOC_GET_ENCRYPTION_NONCE: case F2FS_IOC_GARBAGE_COLLECT: - case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: case F2FS_IOC_GET_FEATURES: case FS_IOC_FSGETXATTR: @@ -4306,7 +4377,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) default: return -ENOIOCTLCMD; } - return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); + return __f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif From 64f8bff5d97f807fbfef34c19932d6cfafab116d Mon Sep 17 00:00:00 2001 From: Hyeongseok Kim Date: Thu, 12 Nov 2020 18:14:54 +0900 Subject: [PATCH 114/580] f2fs: fix double free of unicode map In case of retrying fill_super with skip_recovery, s_encoding for casefold would not be loaded again even though it's already been freed because it's not NULL. Set NULL after free to prevent double freeing when unmount. Fixes: eca4873ee1b6 ("f2fs: Use generic casefolding support") Signed-off-by: Hyeongseok Kim Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3399728002bc..0fd0c6f08838 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3867,6 +3867,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_UNICODE utf8_unload(sb->s_encoding); + sb->s_encoding = NULL; #endif free_options: #ifdef CONFIG_QUOTA From 8e3428d13cc501ac35355ed619c9f327f1145e1f Mon Sep 17 00:00:00 2001 From: Liu Song Date: Wed, 18 Nov 2020 22:01:04 +0800 Subject: [PATCH 115/580] f2fs: remove writeback_inodes_sb in f2fs_remount Since sync_inodes_sb has been used, there is no need to use writeback_inodes_sb, so remove it. Signed-off-by: Liu Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0fd0c6f08838..293c39a5f43b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1891,7 +1891,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (*flags & SB_RDONLY || F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { - writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); From f8ee2966317722b7040ebd7d8457b34662eea607 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Fri, 20 Nov 2020 21:33:34 +0800 Subject: [PATCH 116/580] f2fs: Remove the redundancy initialization There are two assignments are meaningless, and remove them. 
Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- fs/f2fs/checkpoint.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 306413589827..1e5e9b1136ee 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -384,7 +384,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, struct page *dpage) { struct posix_acl *default_acl = NULL, *acl = NULL; - int error = 0; + int error; error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage); if (error) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 023462e80e58..9b0628e0d8bc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -37,7 +37,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); - struct page *page = NULL; + struct page *page; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { From edc5270183616efe398afa719e1a1273de6ec326 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 19 Nov 2020 06:09:02 +0000 Subject: [PATCH 117/580] libfs: Add generic function for setting dentry_ops This adds a function to set dentry operations at lookup time that will work for both encrypted filenames and casefolded filenames. A filesystem that supports both features simultaneously can use this function during lookup preparations to set up its dentry operations once fscrypt no longer does that itself. Currently the casefolding dentry operation are always set if the filesystem defines an encoding because the features is toggleable on empty directories. Unlike in the encryption case, the dentry operations used come from the parent. Since we don't know what set of functions we'll eventually need, and cannot change them later, we enable the casefolding operations if the filesystem supports them at all. By splitting out the various cases, we support as few dentry operations as we can get away with, maximizing compatibility with overlayfs, which will not function if a filesystem supports certain dentry_operations. Signed-off-by: Daniel Rosenberg Reviewed-by: Theodore Ts'o Reviewed-by: Eric Biggers Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jaegeuk Kim --- fs/libfs.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 71 insertions(+) diff --git a/fs/libfs.c b/fs/libfs.c index aca24b05ca9e..b82041854991 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1340,4 +1340,74 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return 0; } EXPORT_SYMBOL(generic_ci_d_hash); + +static const struct dentry_operations generic_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, +}; #endif + +#ifdef CONFIG_FS_ENCRYPTION +static const struct dentry_operations generic_encrypted_dentry_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; +#endif + +#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +static const struct dentry_operations generic_encrypted_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, + .d_revalidate = fscrypt_d_revalidate, +}; +#endif + +/** + * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry + * @dentry: dentry to set ops on + * + * Casefolded directories need d_hash and d_compare set, so that the dentries + * contained in them are handled case-insensitively. 
Note that these operations + * are needed on the parent directory rather than on the dentries in it, and + * while the casefolding flag can be toggled on and off on an empty directory, + * dentry_operations can't be changed later. As a result, if the filesystem has + * casefolding support enabled at all, we have to give all dentries the + * casefolding operations even if their inode doesn't have the casefolding flag + * currently (and thus the casefolding ops would be no-ops for now). + * + * Encryption works differently in that the only dentry operation it needs is + * d_revalidate, which it only needs on dentries that have the no-key name flag. + * The no-key flag can't be set "later", so we don't have to worry about that. + * + * Finally, to maximize compatibility with overlayfs (which isn't compatible + * with certain dentry operations) and to avoid taking an unnecessary + * performance hit, we use custom dentry_operations for each possible + * combination rather than always installing all operations. + */ +void generic_set_encrypted_ci_d_ops(struct dentry *dentry) +{ +#ifdef CONFIG_FS_ENCRYPTION + bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; +#endif +#ifdef CONFIG_UNICODE + bool needs_ci_ops = dentry->d_sb->s_encoding; +#endif +#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) + if (needs_encrypt_ops && needs_ci_ops) { + d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); + return; + } +#endif +#ifdef CONFIG_FS_ENCRYPTION + if (needs_encrypt_ops) { + d_set_d_op(dentry, &generic_encrypted_dentry_ops); + return; + } +#endif +#ifdef CONFIG_UNICODE + if (needs_ci_ops) { + d_set_d_op(dentry, &generic_ci_dentry_ops); + return; + } +#endif +} +EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); diff --git a/include/linux/fs.h b/include/linux/fs.h index c192bb8ebfb3..24e5bc507eca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3256,6 +3256,7 @@ extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif +extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, From 04d57a86ea10e801c8d48c74b99c451ea11704d6 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 19 Nov 2020 06:09:03 +0000 Subject: [PATCH 118/580] fscrypt: Have filesystems handle their d_ops This shifts the responsibility of setting up dentry operations from fscrypt to the individual filesystems, allowing them to have their own operations while still setting fscrypt's d_revalidate as appropriate. Most filesystems can just use generic_set_encrypted_ci_d_ops, unless they have their own specific dentry operations as well. That operation will set the minimal d_ops required under the circumstances. Since the fscrypt d_ops are set later on, we must set all d_ops there, since we cannot adjust those later on. This should not result in any change in behavior. 
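Each converted filesystem ends up with the same calling pattern in its ->lookup(); a condensed sketch of that pattern (simplified from the diffs below, with the actual directory search and cleanup elided):

  static struct dentry *example_lookup(struct inode *dir, struct dentry *dentry,
  				     unsigned int flags)
  {
  	struct fscrypt_name fname;
  	int err;

  	err = fscrypt_prepare_lookup(dir, dentry, &fname);
  	/* Pick the dentry_operations even for negative lookups, since
  	 * they cannot be changed later. */
  	generic_set_encrypted_ci_d_ops(dentry);
  	if (err == -ENOENT)
  		return d_splice_alias(NULL, dentry);
  	if (err)
  		return ERR_PTR(err);

  	/* ... search the directory using &fname, then fscrypt_free_filename() ... */
  	return NULL;
  }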
Signed-off-by: Daniel Rosenberg Acked-by: Theodore Ts'o Acked-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/crypto/fname.c | 4 ---- fs/crypto/fscrypt_private.h | 1 - fs/crypto/hooks.c | 1 - fs/ext4/dir.c | 48 ------------------------------------- fs/ext4/ext4.h | 4 ---- fs/ext4/namei.c | 1 + fs/ext4/super.c | 5 ---- fs/f2fs/dir.c | 7 ------ fs/f2fs/f2fs.h | 3 --- fs/f2fs/namei.c | 1 + fs/f2fs/super.c | 1 - fs/ubifs/dir.c | 1 + include/linux/fscrypt.h | 7 ++++-- 13 files changed, 8 insertions(+), 76 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index e50d670350f2..375b790b9565 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -608,7 +608,3 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return valid; } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); - -const struct dentry_operations fscrypt_d_ops = { - .d_revalidate = fscrypt_d_revalidate, -}; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index eb7fcd2b7fb8..40f27d95ede3 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -270,7 +270,6 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); -extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 7099065154cb..5e262d552188 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -116,7 +116,6 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); - d_set_d_op(dentry, &fscrypt_d_ops); } return err; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 6f420e4f71c2..f5b1300b7572 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -661,51 +661,3 @@ const struct file_operations ext4_dir_operations = { .open = ext4_dir_open, .release = ext4_release_dir, }; - -#ifdef CONFIG_UNICODE -static int ext4_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) -{ - struct qstr qstr = {.name = str, .len = len }; - struct inode *inode = dentry->d_parent->d_inode; - - if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { - if (len != name->len) - return -1; - return memcmp(str, name->name, len); - } - - return ext4_ci_compare(inode, name, &qstr, false); -} - -static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) -{ - const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); - const struct unicode_map *um = sbi->s_encoding; - unsigned char *norm; - int len, ret = 0; - - if (!IS_CASEFOLDED(dentry->d_inode) || !um) - return 0; - - norm = kmalloc(PATH_MAX, GFP_ATOMIC); - if (!norm) - return -ENOMEM; - - len = utf8_casefold(um, str, norm, PATH_MAX); - if (len < 0) { - if (ext4_has_strict_mode(sbi)) - ret = -EINVAL; - goto out; - } - str->hash = full_name_hash(dentry, norm, len); -out: - kfree(norm); - return ret; -} - -const struct dentry_operations ext4_dentry_ops = { - .d_hash = ext4_d_hash, - .d_compare = ext4_d_compare, -}; -#endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a804384ad88d..8e762cda0ed9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3057,10 +3057,6 @@ static inline void ext4_unlock_group(struct super_block *sb, /* dir.c */ extern const struct file_operations ext4_dir_operations; -#ifdef CONFIG_UNICODE -extern const struct dentry_operations ext4_dentry_ops; -#endif - /* 
file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1531ba79e744..187de3344afc 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1591,6 +1591,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 13089271d7bb..c44e2da78312 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4470,11 +4470,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } -#ifdef CONFIG_UNICODE - if (sbi->s_encoding) - sb->s_d_op = &ext4_dentry_ops; -#endif - sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 80f23315b1a1..50e22d696ae5 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1099,10 +1099,3 @@ const struct file_operations f2fs_dir_operations = { .compat_ioctl = f2fs_compat_ioctl, #endif }; - -#ifdef CONFIG_UNICODE -const struct dentry_operations f2fs_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, -}; -#endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 14ba62a55bca..db3c334ff94a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3701,9 +3701,6 @@ static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; -#ifdef CONFIG_UNICODE -extern const struct dentry_operations f2fs_dentry_ops; -#endif extern const struct file_operations f2fs_file_operations; extern const struct inode_operations f2fs_file_inode_operations; extern const struct address_space_operations f2fs_dblock_aops; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 90565432559c..70a8e516fd32 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -492,6 +492,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } err = f2fs_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) goto out_splice; if (err) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 293c39a5f43b..d0fc95ec10e2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3386,7 +3386,6 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) sbi->sb->s_encoding = encoding; sbi->sb->s_encoding_flags = encoding_flags; - sbi->sb->s_d_op = &f2fs_dentry_ops; } #else if (f2fs_sb_has_casefold(sbi)) { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 56c2f9971b37..92dedaf04f77 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -221,6 +221,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); err = fscrypt_prepare_lookup(dir, dentry, &nm); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return d_splice_alias(NULL, dentry); if (err) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 0362699f7c86..96ced3cb8c31 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -633,8 +633,11 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, * directory's encryption key is available, then the lookup is assumed to be by * plaintext name; otherwise, it is assumed to be by no-key name. 
* - * This also installs a custom ->d_revalidate() method which will invalidate the - * dentry if it was created without the key and the key is later added. + * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key + * name. In this case the filesystem must assign the dentry a dentry_operations + * which contains fscrypt_d_revalidate (or contains a d_revalidate method that + * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the + * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; From 2ea369bdcc909d77f2ba7ee7338907cc611a10f2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 19 Nov 2020 06:09:04 +0000 Subject: [PATCH 119/580] f2fs: Handle casefolding with Encryption Expand f2fs's casefolding support to include encrypted directories. To index casefolded+encrypted directories, we use the SipHash of the casefolded name, keyed by a key derived from the directory's fscrypt master key. This ensures that the dirhash doesn't leak information about the plaintext filenames. Encryption keys are unavailable during roll-forward recovery, so we can't compute the dirhash when recovering a new dentry in an encrypted + casefolded directory. To avoid having to force a checkpoint when a new file is fsync'ed, store the dirhash on-disk appended to i_name. This patch incorporates work by Eric Biggers and Jaegeuk Kim . Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Signed-off-by: Daniel Rosenberg Reviewed-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 100 ++++++++++++++++++++++++++++++++++----------- fs/f2fs/f2fs.h | 8 ++-- fs/f2fs/hash.c | 11 ++++- fs/f2fs/inline.c | 4 ++ fs/f2fs/recovery.c | 12 +++++- fs/f2fs/super.c | 6 --- 6 files changed, 107 insertions(+), 34 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 50e22d696ae5..5f067e7e35e7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include #include #include #include @@ -206,30 +207,55 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, /* * Test whether a case-insensitive directory entry matches the filename * being searched for. + * + * Returns 1 for a match, 0 for no match, and -errno on an error. */ -static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, +static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, const u8 *de_name, u32 de_name_len) { const struct super_block *sb = dir->i_sb; const struct unicode_map *um = sb->s_encoding; + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); struct qstr entry = QSTR_INIT(de_name, de_name_len); int res; - res = utf8_strncasecmp_folded(um, name, &entry); - if (res < 0) { - /* - * In strict mode, ignore invalid names. In non-strict mode, - * fall back to treating them as opaque byte sequences. 
- */ - if (sb_has_strict_encoding(sb) || name->len != entry.len) - return false; - return !memcmp(name->name, entry.name, name->len); + if (IS_ENCRYPTED(dir)) { + const struct fscrypt_str encrypted_name = + FSTR_INIT((u8 *)de_name, de_name_len); + + if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir))) + return -EINVAL; + + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); + if (!decrypted_name.name) + return -ENOMEM; + res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name, + &decrypted_name); + if (res < 0) + goto out; + entry.name = decrypted_name.name; + entry.len = decrypted_name.len; } - return res == 0; + + res = utf8_strncasecmp_folded(um, name, &entry); + /* + * In strict mode, ignore invalid names. In non-strict mode, + * fall back to treating them as opaque byte sequences. + */ + if (res < 0 && !sb_has_strict_encoding(sb)) { + res = name->len == entry.len && + memcmp(name->name, entry.name, name->len) == 0; + } else { + /* utf8_strncasecmp_folded returns 0 on match */ + res = (res == 0); + } +out: + kfree(decrypted_name.name); + return res; } #endif /* CONFIG_UNICODE */ -static inline bool f2fs_match_name(const struct inode *dir, +static inline int f2fs_match_name(const struct inode *dir, const struct f2fs_filename *fname, const u8 *de_name, u32 de_name_len) { @@ -256,6 +282,7 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; + int res = 0; if (max_slots) *max_slots = 0; @@ -273,10 +300,15 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, continue; } - if (de->hash_code == fname->hash && - f2fs_match_name(d->inode, fname, d->filename[bit_pos], - le16_to_cpu(de->name_len))) - goto found; + if (de->hash_code == fname->hash) { + res = f2fs_match_name(d->inode, fname, + d->filename[bit_pos], + le16_to_cpu(de->name_len)); + if (res < 0) + return ERR_PTR(res); + if (res) + goto found; + } if (max_slots && max_len > *max_slots) *max_slots = max_len; @@ -326,7 +358,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, } de = find_in_block(dir, dentry_page, fname, &max_slots); - if (de) { + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; + break; + } else if (de) { *res_page = dentry_page; break; } @@ -448,17 +484,39 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_put_page(page, 1); } -static void init_dent_inode(const struct f2fs_filename *fname, +static void init_dent_inode(struct inode *dir, struct inode *inode, + const struct f2fs_filename *fname, struct page *ipage) { struct f2fs_inode *ri; + if (!fname) /* tmpfile case? */ + return; + f2fs_wait_on_page_writeback(ipage, NODE, true, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); ri->i_namelen = cpu_to_le32(fname->disk_name.len); memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); + if (IS_ENCRYPTED(dir)) { + file_set_enc_name(inode); + /* + * Roll-forward recovery doesn't have encryption keys available, + * so it can't compute the dirhash for encrypted+casefolded + * filenames. Append it to i_name if possible. Else, disable + * roll-forward recovery of the dentry (i.e., make fsync'ing the + * file force a checkpoint) by setting LOST_PINO. 
+ */ + if (IS_CASEFOLDED(dir)) { + if (fname->disk_name.len + sizeof(f2fs_hash_t) <= + F2FS_NAME_LEN) + put_unaligned(fname->hash, (f2fs_hash_t *) + &ri->i_name[fname->disk_name.len]); + else + file_lost_pino(inode); + } + } set_page_dirty(ipage); } @@ -541,11 +599,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, return page; } - if (fname) { - init_dent_inode(fname, page); - if (IS_ENCRYPTED(dir)) - file_set_enc_name(inode); - } + init_dent_inode(dir, inode, fname, page); /* * This file should be checkpointed during fsync. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index db3c334ff94a..51a44ee1ddf2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -453,9 +453,11 @@ struct f2fs_filename { #ifdef CONFIG_UNICODE /* * For casefolded directories: the casefolded name, but it's left NULL - * if the original name is not valid Unicode or if the filesystem is - * doing an internal operation where usr_fname is also NULL. In these - * cases we fall back to treating the name as an opaque byte sequence. + * if the original name is not valid Unicode, if the directory is both + * casefolded and encrypted and its encryption key is unavailable, or if + * the filesystem is doing an internal operation where usr_fname is also + * NULL. In all these cases we fall back to treating the name as an + * opaque byte sequence. */ struct fscrypt_str cf_name; #endif diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index e5997919472d..f9b706495d1d 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -112,7 +112,9 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) * If the casefolded name is provided, hash it instead of the * on-disk name. If the casefolded name is *not* provided, that * should only be because the name wasn't valid Unicode, so fall - * back to treating the name as an opaque byte sequence. + * back to treating the name as an opaque byte sequence. Note + * that to handle encrypted directories, the fallback must use + * usr_fname (plaintext) rather than disk_name (ciphertext). */ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { @@ -122,6 +124,13 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) name = fname->usr_fname->name; len = fname->usr_fname->len; } + if (IS_ENCRYPTED(dir)) { + struct qstr tmp = QSTR_INIT(name, len); + + fname->hash = + cpu_to_le32(fscrypt_fname_siphash(dir, &tmp)); + return; + } } #endif fname->hash = cpu_to_le32(TEA_hash_name(name, len)); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index cd1cc890765c..6b45c0acdd27 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -331,6 +331,10 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, make_dentry_ptr_inline(dir, &d, inline_dentry); de = f2fs_find_target_dentry(&d, fname, NULL); unlock_page(ipage); + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; + } if (de) *res_page = ipage; else diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index c22149ea46fd..bf624dfa18af 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include #include #include #include "f2fs.h" @@ -128,7 +129,16 @@ static int init_recovered_filename(const struct inode *dir, } /* Compute the hash of the filename */ - if (IS_CASEFOLDED(dir)) { + if (IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)) { + /* + * In this case the hash isn't computable without the key, so it + * was saved on-disk. 
+ */ + if (fname->disk_name.len + sizeof(f2fs_hash_t) > F2FS_NAME_LEN) + return -EINVAL; + fname->hash = get_unaligned((f2fs_hash_t *) + &raw_inode->i_name[fname->disk_name.len]); + } else if (IS_CASEFOLDED(dir)) { err = f2fs_init_casefolded_name(dir, fname); if (err) return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d0fc95ec10e2..258dbd31be40 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3358,12 +3358,6 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) struct unicode_map *encoding; __u16 encoding_flags; - if (f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, - "Can't mount with encoding and encryption"); - return -EINVAL; - } - if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info, &encoding_flags)) { f2fs_err(sbi, From cd21cc9af7eee791cc4d71a7b348518a46254ded Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Mon, 23 Nov 2020 10:58:32 +0530 Subject: [PATCH 120/580] f2fs: change to use rwsem for cp_mutex Use rwsem to ensure serialization of the callers and to avoid starvation of high priority tasks, when the system is under heavy IO workload. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 4 ++-- fs/f2fs/recovery.c | 4 ++-- fs/f2fs/super.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9b0628e0d8bc..e5d2ffa80b56 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -348,13 +348,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* if locked failed, cp will flush dirty pages instead */ - if (!mutex_trylock(&sbi->cp_mutex)) + if (!down_write_trylock(&sbi->cp_global_sem)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); - mutex_unlock(&sbi->cp_mutex); + up_write(&sbi->cp_global_sem); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -1572,7 +1572,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_warn(sbi, "Start checkpoint disabled!"); } if (cpc->reason != CP_RESIZE) - mutex_lock(&sbi->cp_mutex); + down_write(&sbi->cp_global_sem); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || @@ -1647,7 +1647,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: if (cpc->reason != CP_RESIZE) - mutex_unlock(&sbi->cp_mutex); + up_write(&sbi->cp_global_sem); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 51a44ee1ddf2..bbda309f9e22 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1373,7 +1373,7 @@ struct f2fs_sb_info { int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ - struct mutex cp_mutex; /* checkpoint procedure lock */ + struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ struct rw_semaphore node_change; /* locking node change */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 05641a1e36cc..3ef84e6ded41 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1986,7 +1986,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) freeze_super(sbi->sb); 
down_write(&sbi->gc_lock); - mutex_lock(&sbi->cp_mutex); + down_write(&sbi->cp_global_sem); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + @@ -2031,7 +2031,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) spin_unlock(&sbi->stat_lock); } out_err: - mutex_unlock(&sbi->cp_mutex); + up_write(&sbi->cp_global_sem); up_write(&sbi->gc_lock); thaw_super(sbi->sb); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index bf624dfa18af..a80526f4fed0 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -798,7 +798,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ - mutex_lock(&sbi->cp_mutex); + down_write(&sbi->cp_global_sem); /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); @@ -834,7 +834,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } else { clear_sbi_flag(sbi, SBI_POR_DOING); } - mutex_unlock(&sbi->cp_mutex); + up_write(&sbi->cp_global_sem); /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list, err); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 258dbd31be40..a04ad9610d47 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3511,7 +3511,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->valid_super_block = valid_super_block; init_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); - mutex_init(&sbi->cp_mutex); + init_rwsem(&sbi->cp_global_sem); init_rwsem(&sbi->node_write); init_rwsem(&sbi->node_change); From 1f1ece34c858a94b134c192fe48afd62fbcd4a0b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Nov 2020 10:57:36 +0800 Subject: [PATCH 121/580] f2fs: fix to avoid REQ_TIME and CP_TIME collision Lei Li reported a issue: if foreground operations are frequent, background checkpoint may be always skipped due to below check, result in losing more data after sudden power-cut. f2fs_balance_fs_bg() ... if (!is_idle(sbi, REQ_TIME) && (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) return; E.g: cp_interval = 5 second idle_interval = 2 second foreground operation interval = 1 second (append 1 byte per second into file) In such case, no matter when it calls f2fs_balance_fs_bg(), is_idle(, REQ_TIME) returns false, result in skipping background checkpoint. 
This patch changes as below to make trigger condition being more reasonable: - trigger sync_fs() if dirty_{nats,nodes} and prefree segs exceeds threshold; - skip triggering sync_fs() if there is any background inflight IO or there is foreground operation recently and meanwhile cp_rwsem is being held by someone; Reported-by: Lei Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 19 +++++++++++++------ fs/f2fs/segment.c | 43 +++++++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bbda309f9e22..659d1062119a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2405,24 +2405,31 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) { - if (sbi->gc_mode == GC_URGENT_HIGH) - return true; - if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || get_pages(sbi, F2FS_DIO_WRITE)) - return false; + return true; if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info && atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) - return false; + return true; if (SM_I(sbi) && SM_I(sbi)->fcc_info && atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + return true; + return false; +} + +static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +{ + if (sbi->gc_mode == GC_URGENT_HIGH) + return true; + + if (is_inflight_io(sbi, type)) return false; if (sbi->gc_mode == GC_URGENT_LOW && diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 416785766b4c..f73e8ea1961c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -529,31 +529,38 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) else f2fs_build_free_nids(sbi, false, false); - if (!is_idle(sbi, REQ_TIME) && - (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) + if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) || + excess_prefree_segs(sbi)) + goto do_sync; + + /* there is background inflight IO or foreground operation recently */ + if (is_inflight_io(sbi, REQ_TIME) || + (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem))) return; + /* exceed periodical checkpoint timeout threshold */ + if (f2fs_time_over(sbi, CP_TIME)) + goto do_sync; + /* checkpoint is the only way to shrink partial cached entries */ - if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) || - !f2fs_available_free_memory(sbi, INO_ENTRIES) || - excess_prefree_segs(sbi) || - excess_dirty_nats(sbi) || - excess_dirty_nodes(sbi) || - f2fs_time_over(sbi, CP_TIME)) { - if (test_opt(sbi, DATA_FLUSH) && from_bg) { - struct blk_plug plug; + if (f2fs_available_free_memory(sbi, NAT_ENTRIES) || + f2fs_available_free_memory(sbi, INO_ENTRIES)) + return; - mutex_lock(&sbi->flush_lock); +do_sync: + if (test_opt(sbi, DATA_FLUSH) && from_bg) { + struct blk_plug plug; - blk_start_plug(&plug); - f2fs_sync_dirty_inodes(sbi, FILE_INODE); - blk_finish_plug(&plug); + mutex_lock(&sbi->flush_lock); - mutex_unlock(&sbi->flush_lock); - } - f2fs_sync_fs(sbi->sb, true); - stat_inc_bg_cp_count(sbi->stat_info); + blk_start_plug(&plug); + f2fs_sync_dirty_inodes(sbi, FILE_INODE); + blk_finish_plug(&plug); + + mutex_unlock(&sbi->flush_lock); } + f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); } static int __submit_flush_wait(struct 
f2fs_sb_info *sbi, From 4d86838ad9595146537550d7f604e444a52d4d5d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Nov 2020 18:32:09 +0800 Subject: [PATCH 122/580] f2fs: compress: support chksum This patch supports to store chksum value with compressed data, and verify the integrality of compressed data while reading the data. The feature can be enabled through specifying mount option 'compress_chksum'. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 23 +++++++++++++++++++++++ fs/f2fs/f2fs.h | 16 ++++++++++++++-- fs/f2fs/inode.c | 3 +++ fs/f2fs/super.c | 9 +++++++++ include/linux/f2fs_fs.h | 2 +- 5 files changed, 50 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 20643ac3638a..58db1ff6f3d5 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -573,6 +573,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; struct page **new_cpages; + u32 chksum = 0; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, @@ -626,6 +627,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->cbuf->clen = cpu_to_le32(cc->clen); + if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM) + chksum = f2fs_crc32(F2FS_I_SB(cc->inode), + cc->cbuf->cdata, cc->clen); + cc->cbuf->chksum = cpu_to_le32(chksum); + for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); @@ -761,6 +767,23 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) ret = cops->decompress_pages(dic); + if (!ret && fi->i_compress_flag & 1 << COMPRESS_CHKSUM) { + u32 provided = le32_to_cpu(dic->cbuf->chksum); + u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); + + if (provided != calculated) { + if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { + set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); + printk_ratelimited( + "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x", + KERN_INFO, sbi->sb->s_id, dic->inode->i_ino, + provided, calculated); + } + set_sbi_flag(sbi, SBI_NEED_FSCK); + WARN_ON_ONCE(1); + } + } + out_vunmap_cbuf: vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 659d1062119a..1bdd3ac2540c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -146,7 +146,8 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ - unsigned compress_log_size; /* cluster log size */ + unsigned char compress_log_size; /* cluster log size */ + bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; @@ -675,6 +676,7 @@ enum { FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ FI_MAX, /* max flag, never be used */ }; @@ -732,6 +734,7 @@ struct f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -1276,9 +1279,15 @@ enum compress_algorithm_type { COMPRESS_MAX, }; -#define 
COMPRESS_DATA_RESERVED_SIZE 5 +enum compress_flag { + COMPRESS_CHKSUM, + COMPRESS_MAX_FLAG, +}; + +#define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ + __le32 chksum; /* compressed data chksum */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -3901,6 +3910,9 @@ static inline void set_compress_context(struct inode *inode) F2FS_OPTION(sbi).compress_algorithm; F2FS_I(inode)->i_log_cluster_size = F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_compress_flag = + F2FS_OPTION(sbi).compress_chksum ? + 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 3c13f88d8139..a67705694687 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -456,6 +456,7 @@ static int do_read_inode(struct inode *inode) le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; + fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag); fi->i_cluster_size = 1 << fi->i_log_cluster_size; set_inode_flag(inode, FI_COMPRESSED_FILE); } @@ -634,6 +635,8 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; + ri->i_compress_flag = + cpu_to_le16(F2FS_I(inode)->i_compress_flag); ri->i_log_cluster_size = F2FS_I(inode)->i_log_cluster_size; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a04ad9610d47..47133b85790e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -144,6 +144,7 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_compress_chksum, Opt_atgc, Opt_err, }; @@ -211,6 +212,7 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_compress_chksum, "compress_chksum"}, {Opt_atgc, "atgc"}, {Opt_err, NULL}, }; @@ -921,10 +923,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; + case Opt_compress_chksum: + F2FS_OPTION(sbi).compress_chksum = true; + break; #else case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: + case Opt_compress_chksum: f2fs_info(sbi, "compression options not supported"); break; #endif @@ -1515,6 +1521,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, seq_printf(seq, ",compress_extension=%s", F2FS_OPTION(sbi).extensions[i]); } + + if (F2FS_OPTION(sbi).compress_chksum) + seq_puts(seq, ",compress_chksum"); } static int f2fs_show_options(struct seq_file *seq, struct dentry *root) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index a5dbb57a687f..7dc2a06cf19a 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -273,7 +273,7 @@ struct f2fs_inode { __le64 i_compr_blocks; /* # of compressed blocks */ __u8 i_compress_algorithm; /* compress algorithm */ __u8 i_log_cluster_size; /* log of cluster size */ - __le16 i_padding; /* padding */ + __le16 i_compress_flag; /* compress flag */ __le32 i_extra_end[0]; /* for attribute size calculation */ } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ From 18d4311d2f591bab118bbe7c302e0b7bf28e4c4d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 27 Nov 2020 
21:20:06 +0800 Subject: [PATCH 123/580] f2fs: fix kbytes written stat for multi-device case For multi-device case, one f2fs image includes multi devices, so it needs to account bytes written of all block devices belong to the image rather than one main block device, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 27 +++++++++++++++++++++++---- fs/f2fs/f2fs.h | 8 +------- fs/f2fs/super.c | 5 +---- fs/f2fs/sysfs.c | 3 ++- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e5d2ffa80b56..14ba1519639e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1385,6 +1385,27 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, f2fs_submit_merged_write(sbi, META_FLUSH); } +static inline u64 get_sectors_written(struct block_device *bdev) +{ + return bdev->bd_part ? + (u64)part_stat_read(bdev->bd_part, sectors[STAT_WRITE]) : 0; +} + +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) +{ + if (f2fs_is_multi_device(sbi)) { + u64 sectors = 0; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + sectors += get_sectors_written(FDEV(i).bdev); + + return sectors; + } + + return get_sectors_written(sbi->sb->s_bdev); +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1395,7 +1416,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; int err; @@ -1490,9 +1510,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Record write statistics in the hot node summary */ kbytes_written = sbi->kbytes_written; - if (sb->s_bdev->bd_part) - kbytes_written += BD_PART_WRITTEN(sbi); - + kbytes_written += (f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1; seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); if (__remain_node_summaries(cpc->reason)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1bdd3ac2540c..9a7cce0432d7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1607,13 +1607,6 @@ static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) return sbi->s_ndevs > 1; } -/* For write statistics. Suppose sector size is 512 bytes, - * and the return value is in kbytes. s is of struct f2fs_sb_info. 
- */ -#define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \ - (s)->sectors_written_start) >> 1) - static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { unsigned long now = jiffies; @@ -3392,6 +3385,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 47133b85790e..9ab451f5b7bc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3661,10 +3661,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* For write statistics */ - if (sb->s_bdev->bd_part) - sbi->sectors_written_start = - (u64)part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]); + sbi->sectors_written_start = f2fs_get_sectors_written(sbi); /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 64f7860ba3e4..5d1cb5063d1c 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -97,7 +97,8 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, return sprintf(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); + ((f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1))); } static ssize_t features_show(struct f2fs_attr *a, From 65c0faeb261f277ec33be11ef220e31c28393356 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Nov 2020 14:55:47 -0800 Subject: [PATCH 124/580] f2fs: rename logical_to_blk and blk_to_logical This patch renames two functions like below having u64. 
- logical_to_blk to bytes_to_blks - blk_to_logical to blks_to_bytes Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6440c297012f..396a4e6ece54 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1759,14 +1759,14 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock, NO_CHECK_TYPE, create); } -static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) { - return (offset >> inode->i_blkbits); + return (bytes >> inode->i_blkbits); } -static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +static inline u64 blks_to_bytes(struct inode *inode, u64 blks) { - return (blk << inode->i_blkbits); + return (blks << inode->i_blkbits); } static int f2fs_xattr_fiemap(struct inode *inode, @@ -1794,7 +1794,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = (__u64)blk_to_logical(inode, ni.blk_addr); + phys = blks_to_bytes(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)); @@ -1826,7 +1826,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = (__u64)blk_to_logical(inode, ni.blk_addr); + phys = blks_to_bytes(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; f2fs_put_page(page, 1); @@ -1896,18 +1896,18 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - if (logical_to_blk(inode, len) == 0) - len = blk_to_logical(inode, 1); + if (bytes_to_blks(inode, len) == 0) + len = blks_to_bytes(inode, 1); - start_blk = logical_to_blk(inode, start); - last_blk = logical_to_blk(inode, start + len - 1); + start_blk = bytes_to_blks(inode, start); + last_blk = bytes_to_blks(inode, start + len - 1); next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; if (compr_cluster) - map_bh.b_size = blk_to_logical(inode, cluster_size - 1); + map_bh.b_size = blks_to_bytes(inode, cluster_size - 1); ret = get_data_block(inode, start_blk, &map_bh, 0, F2FS_GET_BLOCK_FIEMAP, &next_pgofs); @@ -1918,7 +1918,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, max_inode_blocks(inode))) goto prep_next; @@ -1944,9 +1944,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, compr_cluster = false; - logical = blk_to_logical(inode, start_blk - 1); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = blk_to_logical(inode, cluster_size); + logical = blks_to_bytes(inode, start_blk - 1); + phys = blks_to_bytes(inode, map_bh.b_blocknr); + size = blks_to_bytes(inode, cluster_size); flags |= FIEMAP_EXTENT_ENCODED; @@ -1964,14 +1964,14 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto prep_next; } - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); + logical = blks_to_bytes(inode, start_blk); + phys = blks_to_bytes(inode, map_bh.b_blocknr); size = map_bh.b_size; flags = 0; if (buffer_unwritten(&map_bh)) flags = FIEMAP_EXTENT_UNWRITTEN; - start_blk += logical_to_blk(inode, size); + start_blk += bytes_to_blks(inode, size); prep_next: cond_resched(); @@ -3857,7 +3857,7 @@ static int 
check_swap_activate_fast(struct swap_info_struct *sis, * to be very smart. */ cur_lblock = 0; - last_lblock = logical_to_blk(inode, i_size_read(inode)); + last_lblock = bytes_to_blks(inode, i_size_read(inode)); len = i_size_read(inode); while (cur_lblock <= last_lblock && cur_lblock < sis->max) { @@ -3879,7 +3879,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, goto err_out; pblock = map_bh.b_blocknr; - nr_pblocks = logical_to_blk(inode, map_bh.b_size); + nr_pblocks = bytes_to_blks(inode, map_bh.b_size); if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; From 2cc122b26f5f4e80e17e32ba99189dc486ae356e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Nov 2020 15:04:27 -0800 Subject: [PATCH 125/580] f2fs: use new conversion functions between blks and bytes This patch cleans up blks and bytes conversions. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 396a4e6ece54..970fbabbef96 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1701,6 +1701,16 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) return true; } +static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) +{ + return (bytes >> inode->i_blkbits); +} + +static inline u64 blks_to_bytes(struct inode *inode, u64 blks) +{ + return (blks << inode->i_blkbits); +} + static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, pgoff_t *next_pgofs, int seg_type, bool may_write) @@ -1709,7 +1719,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, int err; map.m_lblk = iblock; - map.m_len = bh->b_size >> inode->i_blkbits; + map.m_len = bytes_to_blks(inode, bh->b_size); map.m_next_pgofs = next_pgofs; map.m_next_extent = NULL; map.m_seg_type = seg_type; @@ -1719,7 +1729,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = (u64)map.m_len << inode->i_blkbits; + bh->b_size = blks_to_bytes(inode, map.m_len); } return err; } @@ -1759,16 +1769,6 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock, NO_CHECK_TYPE, create); } -static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) -{ - return (bytes >> inode->i_blkbits); -} - -static inline u64 blks_to_bytes(struct inode *inode, u64 blks) -{ - return (blks << inode->i_blkbits); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -2004,8 +2004,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, bool is_readahead) { struct bio *bio = *bio_ret; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; + const unsigned blocksize = blks_to_bytes(inode, 1); sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; @@ -2014,8 +2013,8 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, block_in_file = (sector_t)page_index(page); last_block = block_in_file + nr_pages; - last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >> - blkbits; + last_block_in_file = bytes_to_blks(inode, + f2fs_readpage_limit(inode) + blocksize - 1); if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -2127,16 +2126,15 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio 
**bio_ret, struct bio *bio = *bio_ret; unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; sector_t last_block_in_file; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; + const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; int i; int ret = 0; f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); - last_block_in_file = (f2fs_readpage_limit(inode) + - blocksize - 1) >> blkbits; + last_block_in_file = bytes_to_blks(inode, + f2fs_readpage_limit(inode) + blocksize - 1); /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { @@ -3922,7 +3920,6 @@ static int check_swap_activate(struct swap_info_struct *sis, struct inode *inode = mapping->host; unsigned blocks_per_page; unsigned long page_no; - unsigned blkbits; sector_t probe_block; sector_t last_block; sector_t lowest_block = -1; @@ -3933,8 +3930,7 @@ static int check_swap_activate(struct swap_info_struct *sis, if (PAGE_SIZE == F2FS_BLKSIZE) return check_swap_activate_fast(sis, swap_file, span); - blkbits = inode->i_blkbits; - blocks_per_page = PAGE_SIZE >> blkbits; + blocks_per_page = bytes_to_blks(inode, PAGE_SIZE); /* * Map all the blocks into the extent list. This code doesn't try @@ -3942,7 +3938,7 @@ static int check_swap_activate(struct swap_info_struct *sis, */ probe_block = 0; page_no = 0; - last_block = i_size_read(inode) >> blkbits; + last_block = bytes_to_blks(inode, i_size_read(inode)); while ((probe_block + blocks_per_page) <= last_block && page_no < sis->max) { unsigned block_in_page; @@ -3976,7 +3972,7 @@ static int check_swap_activate(struct swap_info_struct *sis, } } - first_block >>= (PAGE_SHIFT - blkbits); + first_block >>= (PAGE_SHIFT - inode->i_blkbits); if (page_no) { /* exclude the header page */ if (first_block < lowest_block) lowest_block = first_block; From 2e1cde8ee1e109087e7485ac0d06795393d7b697 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Nov 2020 15:09:07 -0800 Subject: [PATCH 126/580] f2fs: fix wrong block count instead of bytes We should convert cur_lblock, a block count, to bytes for len. Fixes: af4b6b8edf6a ("f2fs: introduce check_swap_activate_fast()") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 970fbabbef96..b282b616cab4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3847,7 +3847,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; - unsigned long len; + u64 len; int ret; /* @@ -3865,7 +3865,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, cond_resched(); memset(&map_bh, 0, sizeof(struct buffer_head)); - map_bh.b_size = len - cur_lblock; + map_bh.b_size = len - blks_to_bytes(inode, cur_lblock); ret = get_data_block(inode, cur_lblock, &map_bh, 0, F2FS_GET_BLOCK_FIEMAP, &next_pgofs); From 20520481f182d0a7bab10c10d1d38e8e2e86037d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Nov 2020 15:19:10 -0800 Subject: [PATCH 127/580] f2fs: remove buffer_head which has 32bits limit This patch removes buffer_head dependency when getting block addresses. Light reported there's a 32bit issue in f2fs_fiemap where map_bh.b_size is 32bits while len is 64bits given by user. This will give wrong length to f2fs_map_block. 
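
For illustration only (not part of the patch, hypothetical values): on a build where size_t is 32 bits wide, assigning the 64-bit user-supplied byte length to buffer_head's b_size silently drops the upper bits, so f2fs_map_block() is asked to map far less than the caller requested. A minimal standalone sketch of that truncation:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t len = 0x100001000ULL;		/* ~4 GiB + 4 KiB requested by the caller */
		uint32_t b_size = (uint32_t)len;	/* models a 32-bit size_t b_size */

		printf("requested %llu bytes, mapped %u bytes\n",
		       (unsigned long long)len, b_size);	/* prints "mapped 4096 bytes" */
		return 0;
	}

Switching to struct f2fs_map_blocks and converting the length with the bytes_to_blks()/blks_to_bytes() helpers avoids funnelling a 64-bit byte count through b_size at all.
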
Reported-by: Light Hsieh Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 76 ++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 42 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b282b616cab4..9fc04052914d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1734,15 +1734,6 @@ static int __get_data_block(struct inode *inode, sector_t iblock, return err; } -static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int flag, - pgoff_t *next_pgofs) -{ - return __get_data_block(inode, iblock, bh_result, create, - flag, next_pgofs, - NO_CHECK_TYPE, create); -} - static int get_data_block_dio_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -1761,14 +1752,6 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, false); } -static int get_data_block_bmap(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP, NULL, - NO_CHECK_TYPE, create); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -1864,7 +1847,7 @@ static loff_t max_inode_blocks(struct inode *inode) int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - struct buffer_head map_bh; + struct f2fs_map_blocks map; sector_t start_blk, last_blk; pgoff_t next_pgofs; u64 logical = 0, phys = 0, size = 0; @@ -1903,19 +1886,21 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_blk = bytes_to_blks(inode, start + len - 1); next: - memset(&map_bh, 0, sizeof(struct buffer_head)); - map_bh.b_size = len; + memset(&map, 0, sizeof(map)); + map.m_lblk = start_blk; + map.m_len = bytes_to_blks(inode, len); + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = NO_CHECK_TYPE; if (compr_cluster) - map_bh.b_size = blks_to_bytes(inode, cluster_size - 1); + map.m_len = cluster_size - 1; - ret = get_data_block(inode, start_blk, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP, &next_pgofs); + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ - if (!buffer_mapped(&map_bh)) { + if (!(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, @@ -1945,7 +1930,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, logical = blks_to_bytes(inode, start_blk - 1); - phys = blks_to_bytes(inode, map_bh.b_blocknr); + phys = blks_to_bytes(inode, map.m_pblk); size = blks_to_bytes(inode, cluster_size); flags |= FIEMAP_EXTENT_ENCODED; @@ -1958,17 +1943,17 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto prep_next; } - if (map_bh.b_blocknr == COMPRESS_ADDR) { + if (map.m_pblk == COMPRESS_ADDR) { compr_cluster = true; start_blk++; goto prep_next; } logical = blks_to_bytes(inode, start_blk); - phys = blks_to_bytes(inode, map_bh.b_blocknr); - size = map_bh.b_size; + phys = blks_to_bytes(inode, map.m_pblk); + size = blks_to_bytes(inode, map.m_len); flags = 0; - if (buffer_unwritten(&map_bh)) + if (map.m_flags & F2FS_MAP_UNWRITTEN) flags = FIEMAP_EXTENT_UNWRITTEN; start_blk += bytes_to_blks(inode, size); @@ -3751,9 +3736,6 @@ static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - struct buffer_head tmp = { - .b_size = 
i_blocksize(inode), - }; sector_t blknr = 0; if (f2fs_has_inline_data(inode)) @@ -3770,8 +3752,16 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) if (f2fs_compressed_file(inode)) { blknr = f2fs_bmap_compress(inode, block); } else { - if (!get_data_block_bmap(inode, block, &tmp, 0)) - blknr = tmp.b_blocknr; + struct f2fs_map_blocks map; + + memset(&map, 0, sizeof(map)); + map.m_lblk = block; + map.m_len = 1; + map.m_next_pgofs = NULL; + map.m_seg_type = NO_CHECK_TYPE; + + if (!f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_BMAP)) + blknr = map.m_pblk; } out: trace_f2fs_bmap(inode, block, blknr); @@ -3859,25 +3849,27 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, len = i_size_read(inode); while (cur_lblock <= last_lblock && cur_lblock < sis->max) { - struct buffer_head map_bh; + struct f2fs_map_blocks map; pgoff_t next_pgofs; cond_resched(); - memset(&map_bh, 0, sizeof(struct buffer_head)); - map_bh.b_size = len - blks_to_bytes(inode, cur_lblock); + memset(&map, 0, sizeof(map)); + map.m_lblk = cur_lblock; + map.m_len = bytes_to_blks(inode, len) - cur_lblock; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = NO_CHECK_TYPE; - ret = get_data_block(inode, cur_lblock, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP, &next_pgofs); + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) goto err_out; /* hole */ - if (!buffer_mapped(&map_bh)) + if (!(map.m_flags & F2FS_MAP_FLAGS)) goto err_out; - pblock = map_bh.b_blocknr; - nr_pblocks = bytes_to_blks(inode, map_bh.b_size); + pblock = map.m_pblk; + nr_pblocks = map.m_len; if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; From 8395710b1228bf61ac06c1e86902d202c8351c5a Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Tue, 1 Dec 2020 15:45:47 +0800 Subject: [PATCH 128/580] f2fs: init dirty_secmap incorrectly section is dirty, but dirty_secmap may not set Reported-by: Jia Yang Fixes: da52f8ade40b ("f2fs: get the right gc victim section when section has several segments") Cc: Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f73e8ea1961c..1f4cf69f8ba1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4550,7 +4550,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) return; mutex_lock(&dirty_i->seglist_lock); - for (segno = 0; segno < MAIN_SECS(sbi); segno += blks_per_sec) { + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { valid_blocks = get_valid_blocks(sbi, segno, true); secno = GET_SEC_FROM_SEG(sbi, segno); From 4d0706dfea1e14a6be175b3d0a3f2e83ec3df193 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 1 Dec 2020 15:17:39 +0800 Subject: [PATCH 129/580] f2fs: Remove unnecessary unlikely() WARN_ON() already contains an unlikely(), so it's not necessary to use unlikely. 
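
For reference, the generic definition (roughly as in include/asm-generic/bug.h; architectures and configs may differ) already returns the condition wrapped in unlikely(), which is why f2fs_bug_on() can test WARN_ON()'s return value directly without losing the branch hint. A simplified sketch, not the exact kernel text:

	#define WARN_ON(condition) ({				\
		int __ret_warn_on = !!(condition);		\
		if (unlikely(__ret_warn_on))			\
			__WARN();				\
		unlikely(__ret_warn_on);			\
	})
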
Signed-off-by: Yangtao Li Signed-off-by: Shuosheng Huang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a7cce0432d7..f6bf74dd3d33 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -32,10 +32,8 @@ #else #define f2fs_bug_on(sbi, condition) \ do { \ - if (unlikely(condition)) { \ - WARN_ON(1); \ + if (WARN_ON(condition)) \ set_sbi_flag(sbi, SBI_NEED_FSCK); \ - } \ } while (0) #endif From a864559a7783bce2b6f0515a344b69a2d44ed242 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 1 Dec 2020 13:08:02 +0900 Subject: [PATCH 130/580] f2fs: add compress_mode mount option We will add a new "compress_mode" mount option to control file compression mode. This supports "fs" and "user". In "fs" mode (default), f2fs does automatic compression on the compression enabled files. In "user" mode, f2fs disables the automaic compression and gives the user discretion of choosing the target file and the timing. It means the user can do manual compression/decompression on the compression enabled files using ioctls. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 30 ++++++++++++++++++++++++++++++ fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 23 +++++++++++++++++++++++ 5 files changed, 56 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 58db1ff6f3d5..f456d972ae90 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -915,7 +915,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) static bool cluster_may_compress(struct compress_ctx *cc) { - if (!f2fs_compressed_file(cc->inode)) + if (!f2fs_need_compress_data(cc->inode)) return false; if (f2fs_is_atomic_file(cc->inode)) return false; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9fc04052914d..56324b4d8ce9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3101,7 +3101,7 @@ static inline bool __should_serialize_io(struct inode *inode, if (IS_NOQUOTA(inode)) return false; - if (f2fs_compressed_file(inode)) + if (f2fs_need_compress_data(inode)) return true; if (wbc->sync_mode != WB_SYNC_ALL) return true; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f6bf74dd3d33..e7ef34861720 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -147,6 +147,7 @@ struct f2fs_mount_info { unsigned char compress_log_size; /* cluster log size */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ + int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; @@ -676,6 +677,7 @@ enum { FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ + FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ FI_MAX, /* max flag, never be used */ }; @@ -1242,6 +1244,18 @@ enum fsync_mode { FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; +enum { + COMPR_MODE_FS, /* + * automatically compress compression + * enabled files + */ + COMPR_MODE_USER, /* + * automatical compression is disabled. + * user can control the file compression + * using ioctls + */ +}; + /* * this value is set in page as a private data which indicate that * the page is atomically written, and it is in inmem_pages list. 
@@ -2764,6 +2778,22 @@ static inline int f2fs_compressed_file(struct inode *inode) is_inode_flag_set(inode, FI_COMPRESSED_FILE); } +static inline bool f2fs_need_compress_data(struct inode *inode) +{ + int compress_mode = F2FS_OPTION(F2FS_I_SB(inode)).compress_mode; + + if (!f2fs_compressed_file(inode)) + return false; + + if (compress_mode == COMPR_MODE_FS) + return true; + else if (compress_mode == COMPR_MODE_USER && + is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) + return true; + + return false; +} + static inline unsigned int addrs_per_inode(struct inode *inode) { unsigned int addrs = CUR_ADDRS_PER_INODE(inode) - diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1f4cf69f8ba1..6ec659bc8d8d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3260,7 +3260,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) else return CURSEG_COLD_DATA; } - if (file_is_cold(inode) || f2fs_compressed_file(inode)) + if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9ab451f5b7bc..5e79ba4ac247 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -145,6 +145,7 @@ enum { Opt_compress_log_size, Opt_compress_extension, Opt_compress_chksum, + Opt_compress_mode, Opt_atgc, Opt_err, }; @@ -213,6 +214,7 @@ static match_table_t f2fs_tokens = { {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, {Opt_compress_chksum, "compress_chksum"}, + {Opt_compress_mode, "compress_mode=%s"}, {Opt_atgc, "atgc"}, {Opt_err, NULL}, }; @@ -926,11 +928,26 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_compress_chksum: F2FS_OPTION(sbi).compress_chksum = true; break; + case Opt_compress_mode: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "fs")) { + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + } else if (!strcmp(name, "user")) { + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; #else case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: case Opt_compress_chksum: + case Opt_compress_mode: f2fs_info(sbi, "compression options not supported"); break; #endif @@ -1524,6 +1541,11 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, if (F2FS_OPTION(sbi).compress_chksum) seq_puts(seq, ",compress_chksum"); + + if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_FS) + seq_printf(seq, ",compress_mode=%s", "fs"); + else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) + seq_printf(seq, ",compress_mode=%s", "user"); } static int f2fs_show_options(struct seq_file *seq, struct dentry *root) @@ -1670,6 +1692,7 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; F2FS_OPTION(sbi).compress_ext_cnt = 0; + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; set_opt(sbi, INLINE_XATTR); From ac5197d683e64f59179efd46b97b531bd5c389ba Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 3 Dec 2020 15:56:15 +0900 Subject: [PATCH 131/580] f2fs: add F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE Added two ioctl to decompress/compress explicitly the compression enabled file in "compress_mode=user" mount option. 
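
A hypothetical userspace sketch of how the new ioctls might be driven (not part of the patch; it assumes the target file already has the compression flag set, the filesystem is mounted with compress_mode=user, and <linux/f2fs.h> exposes the new ioctl numbers):

	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <stdio.h>
	#include <linux/f2fs.h>

	int main(int argc, char **argv)
	{
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDWR);	/* the ioctls require FMODE_WRITE */
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* or F2FS_IOC_DECOMPRESS_FILE to expand the file again */
		if (ioctl(fd, F2FS_IOC_COMPRESS_FILE) < 0)
			perror("F2FS_IOC_COMPRESS_FILE");
		close(fd);
		return 0;
	}
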
Using these two ioctls, the users can make a control of compression and decompression of their files. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 182 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/f2fs.h | 2 + 2 files changed, 184 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d50b15488313..84d4043103ce 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4048,6 +4048,182 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) return ret; } +static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + pgoff_t redirty_idx = page_idx; + int i, page_len = 0, ret = 0; + + for (i = 0; i < len; i++, page_idx++) { + page = read_cache_page(mapping, page_idx, NULL, NULL); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + break; + } + page_len++; + } + + for (i = 0; i < page_len; i++, redirty_idx++) { + page = find_lock_page(mapping, redirty_idx); + if (!page) + ret = -ENOENT; + set_page_dirty(page); + f2fs_put_page(page, 1); + f2fs_put_page(page, 0); + } + + return ret; +} + +static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int blk_per_seg = sbi->blocks_per_seg; + int cluster_size = F2FS_I(inode)->i_cluster_size; + int count, ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + file_start_write(filp); + inode_lock(inode); + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (f2fs_is_mmap_file(inode)) { + ret = -EBUSY; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + if (!atomic_read(&fi->i_compr_blocks)) + goto out; + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + count = last_idx - page_idx; + while (count) { + int len = min(cluster_size, count); + + ret = redirty_blocks(inode, page_idx, len); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= blk_per_seg) + filemap_fdatawrite(inode->i_mapping); + + count -= len; + page_idx += len; + } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially decompressed " + "(errno=%d). 
Please delete the file.\n", + __func__, ret); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int blk_per_seg = sbi->blocks_per_seg; + int cluster_size = F2FS_I(inode)->i_cluster_size; + int count, ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + file_start_write(filp); + inode_lock(inode); + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (f2fs_is_mmap_file(inode)) { + ret = -EBUSY; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + set_inode_flag(inode, FI_ENABLE_COMPRESS); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + count = last_idx - page_idx; + while (count) { + int len = min(cluster_size, count); + + ret = redirty_blocks(inode, page_idx, len); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= blk_per_seg) + filemap_fdatawrite(inode->i_mapping); + + count -= len; + page_idx += len; + } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + clear_inode_flag(inode, FI_ENABLE_COMPRESS); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially compressed " + "(errno=%d). Please delete the file.\n", + __func__, ret); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -4135,6 +4311,10 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_compress_option(filp, arg); case F2FS_IOC_SET_COMPRESS_OPTION: return f2fs_ioc_set_compress_option(filp, arg); + case F2FS_IOC_DECOMPRESS_FILE: + return f2fs_ioc_decompress_file(filp, arg); + case F2FS_IOC_COMPRESS_FILE: + return f2fs_ioc_compress_file(filp, arg); default: return -ENOTTY; } @@ -4373,6 +4553,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SEC_TRIM_FILE: case F2FS_IOC_GET_COMPRESS_OPTION: case F2FS_IOC_SET_COMPRESS_OPTION: + case F2FS_IOC_DECOMPRESS_FILE: + case F2FS_IOC_COMPRESS_FILE: break; default: return -ENOIOCTLCMD; diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index f00199a2e38b..352a822d4370 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -40,6 +40,8 @@ struct f2fs_comp_option) #define F2FS_IOC_SET_COMPRESS_OPTION _IOW(F2FS_IOCTL_MAGIC, 22, \ struct f2fs_comp_option) +#define F2FS_IOC_DECOMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 23) +#define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24) /* * should be same as XFS_IOC_GOINGDOWN. From ebc7b1b6b0db2abb90d6c1964da3393424b2c6a8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 Nov 2020 13:22:05 -0800 Subject: [PATCH 132/580] f2fs: avoid race condition for shrinker count Light reported sometimes shinker gets nat_cnt < dirty_nat_cnt resulting in wrong do_shinker work. Let's avoid to return insane overflowed value by adding single tracking value. 
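As a side note (not part of the patch), the overflow can be reproduced in a few lines of plain C; the numbers are made up, and an LP64 system with 32-bit unsigned int is assumed:

#include <stdio.h>

int main(void)
{
	/* stale snapshot: another CPU already dirtied more entries */
	unsigned int nat_cnt = 100;		/* total cached NAT entries */
	unsigned int dirty_nat_cnt = 120;	/* dirty NAT entries */

	/*
	 * old scheme: the unsigned subtraction wraps to ~4 billion and the
	 * "count > 0" guard happily hands it to the shrinker
	 */
	long count = nat_cnt - dirty_nat_cnt;
	printf("derived reclaimable: %ld\n", count > 0 ? count : 0);

	/*
	 * new scheme: keep one counter, nat_cnt[RECLAIMABLE_NAT], adjusted by
	 * the same paths that move entries between clean and dirty, so it can
	 * never exceed the total nor go "negative"
	 */
	unsigned int reclaimable_nat = 0;
	printf("tracked reclaimable: %u\n", reclaimable_nat);
	return 0;
}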
Reported-by: Light Hsieh Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/debug.c | 11 ++++++----- fs/f2fs/f2fs.h | 10 ++++++++-- fs/f2fs/node.c | 29 ++++++++++++++++++----------- fs/f2fs/node.h | 4 ++-- fs/f2fs/shrinker.c | 4 +--- 6 files changed, 36 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 14ba1519639e..617d0f6b0836 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1619,7 +1619,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) goto out; } - if (NM_I(sbi)->dirty_nat_cnt == 0 && + if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { f2fs_flush_sit_entries(sbi, cpc); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a8357fd4f5fa..197c914119da 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -145,8 +145,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->node_pages = NODE_MAPPING(sbi)->nrpages; if (sbi->meta_inode) si->meta_pages = META_MAPPING(sbi)->nrpages; - si->nats = NM_I(sbi)->nat_cnt; - si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; + si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; + si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; @@ -278,9 +278,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); - si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); - si->cache_mem += NM_I(sbi)->dirty_nat_cnt * - sizeof(struct nat_entry_set); + si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * + sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e7ef34861720..84b6a30f5853 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -819,6 +819,13 @@ enum nid_state { MAX_NID_STATE, }; +enum nat_state { + TOTAL_NAT, + DIRTY_NAT, + RECLAIMABLE_NAT, + MAX_NAT_STATE, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -834,8 +841,7 @@ struct f2fs_nm_info { struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ - unsigned int nat_cnt; /* the # of cached nat entries */ - unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cfab2447059c..e5f9ff84f0d5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -62,8 +62,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { - mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_SHIFT; + mem_size = (nm_i->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); if (excess_cached_nats(sbi)) res = false; @@ -177,7 
+177,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, list_add_tail(&ne->list, &nm_i->nat_entries); spin_unlock(&nm_i->nat_list_lock); - nm_i->nat_cnt++; + nm_i->nat_cnt[TOTAL_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; return ne; } @@ -207,7 +208,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) { radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); - nm_i->nat_cnt--; + nm_i->nat_cnt[TOTAL_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; __free_nat_entry(e); } @@ -253,7 +255,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, if (get_nat_flag(ne, IS_DIRTY)) goto refresh_list; - nm_i->dirty_nat_cnt++; + nm_i->nat_cnt[DIRTY_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; set_nat_flag(ne, IS_DIRTY, true); refresh_list: spin_lock(&nm_i->nat_list_lock); @@ -273,7 +276,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, set_nat_flag(ne, IS_DIRTY, false); set->entry_cnt--; - nm_i->dirty_nat_cnt--; + nm_i->nat_cnt[DIRTY_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -2944,14 +2948,17 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) LIST_HEAD(sets); int err = 0; - /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ + /* + * during unmount, let's flush nat_bits before checking + * nat_cnt[DIRTY_NAT]. + */ if (enabled_nat_bits(sbi, cpc)) { down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); up_write(&nm_i->nat_tree_lock); } - if (!nm_i->dirty_nat_cnt) + if (!nm_i->nat_cnt[DIRTY_NAT]) return 0; down_write(&nm_i->nat_tree_lock); @@ -2962,7 +2969,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * into nat entry set. 
*/ if (enabled_nat_bits(sbi, cpc) || - !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + !__has_cursum_space(journal, + nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -3086,7 +3094,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi) F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID] = 0; nm_i->nid_cnt[PREALLOC_NID] = 0; - nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; @@ -3220,7 +3227,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) __del_from_nat_cache(nm_i, natvec[idx]); } } - f2fs_bug_on(sbi, nm_i->nat_cnt); + f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]); /* destroy nat set cache */ nid = 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 69e5859e993c..f84541b57acb 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -126,13 +126,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid * NM_I(sbi)->dirty_nats_ratio / 100; } static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; + return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; } static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index d66de5999a26..dd3c3c7a90ec 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -18,9 +18,7 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; - - return count > 0 ? 
count : 0; + return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT]; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) From 6de4624a989fe49f2eaf240714dea777c198dd5c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Dec 2020 09:14:28 -0800 Subject: [PATCH 133/580] f2fs: don't allow any writes on readonly mount generic_make_request: Trying to write to read-only block-device dm-5 (partno 0) WARNING: CPU: 7 PID: 546 at block/blk-core.c:2190 generic_make_request_checks+0x664/0x690 pc : generic_make_request_checks+0x664/0x690 lr : generic_make_request_checks+0x664/0x690 Call trace: generic_make_request_checks+0x664/0x690 generic_make_request+0xf0/0x3a4 submit_bio+0x80/0x250 __submit_merged_bio+0x368/0x4e0 __submit_merged_write_cond.llvm.12294350193007536502+0xe0/0x3e8 f2fs_wait_on_page_writeback+0x84/0x128 f2fs_convert_inline_page+0x35c/0x6f8 f2fs_convert_inline_inode+0xe0/0x2e0 f2fs_file_mmap+0x48/0x9c mmap_region+0x41c/0x74c do_mmap+0x40c/0x4fc vm_mmap_pgoff+0xb8/0x114 vm_mmap+0x34/0x48 elf_map+0x68/0x108 load_elf_binary+0x538/0xb70 search_binary_handler+0xac/0x1dc exec_binprm+0x50/0x15c __do_execve_file+0x620/0x740 __arm64_sys_execve+0x54/0x68 el0_svc_common+0x9c/0x168 el0_svc_handler+0x60/0x6c el0_svc+0x8/0xc Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 6b45c0acdd27..2f58d1c11dba 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -187,7 +187,8 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; - if (!f2fs_has_inline_data(inode)) + if (!f2fs_has_inline_data(inode) || + f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; page = f2fs_grab_cache_page(inode->i_mapping, 0, false); From 9e0fd7d50596d3a3cf0ca4c23c26bc61d10c8011 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Dec 2020 09:52:45 -0800 Subject: [PATCH 134/580] f2fs: introduce max_io_bytes, a sysfs entry, to limit bio size This patch adds max_io_bytes to limit bio size when f2fs tries to merge consecutive IOs. This can give a testing point to split out bios and check end_io handles those bios correctly. This is used to capture a recent bug on the decompression and fsverity flow. Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 7 +++++++ fs/f2fs/data.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 2 ++ 4 files changed, 13 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 834d0becae6d..2618587ab55e 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -350,3 +350,10 @@ Date: April 2020 Contact: "Daeho Jeong" Description: Give a way to change iostat_period time. 3secs by default. The new iostat trace gives stats gap given the period. +What: /sys/fs/f2fs//max_io_bytes +Date: December 2020 +Contact: "Jaegeuk Kim" +Description: This gives a control to limit the bio size in f2fs. + Default is zero, which will follow underlying block layer limit, + whereas, if it has a certain bytes value, f2fs won't submit a + bio larger than that size. 
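As a usage sketch (not part of the patch), the new knob could be set from a test harness as shown below; the device name "sdb1" and the 128KB limit are examples only:

#include <stdio.h>

int main(void)
{
	/* path layout follows the ABI entry above: /sys/fs/f2fs/<disk>/max_io_bytes */
	FILE *f = fopen("/sys/fs/f2fs/sdb1/max_io_bytes", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* cap merged bios at 128KB; writing 0 restores the default behaviour */
	fprintf(f, "%u\n", 128 * 1024);
	fclose(f);
	return 0;
}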
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 56324b4d8ce9..81577dd9e680 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -704,6 +704,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, block_t last_blkaddr, block_t cur_blkaddr) { + if (unlikely(sbi->max_io_bytes && + bio->bi_iter.bi_size >= sbi->max_io_bytes)) + return false; if (last_blkaddr + 1 != cur_blkaddr) return false; return __same_bdev(sbi, cur_blkaddr, bio); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 84b6a30f5853..1d52018101c8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1450,6 +1450,7 @@ struct f2fs_sb_info { loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ int readdir_ra; /* readahead inode in readdir */ + u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 5d1cb5063d1c..db9c49f9f0de 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -567,6 +567,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -651,6 +652,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(iostat_enable), ATTR_LIST(iostat_period_ms), ATTR_LIST(readdir_ra), + ATTR_LIST(max_io_bytes), ATTR_LIST(gc_pin_file_thresh), ATTR_LIST(extension_list), #ifdef CONFIG_F2FS_FAULT_INJECTION From 905b8ad9629e84d6264be5d5a1c53206d121f98e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 7 Dec 2020 18:59:33 +0800 Subject: [PATCH 135/580] f2fs: convert to F2FS_*_INO macro Use F2FS_ROOT_INO, F2FS_NODE_INO and F2FS_META_INO macro for better code readability. Signed-off-by: Yangtao Li Signed-off-by: Shaohua Liu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5e79ba4ac247..15674b2602ea 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3069,9 +3069,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->total_node_count = (le32_to_cpu(raw_super->segment_count_nat) / 2) * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; - sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); - sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); - sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); + F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); + F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; sbi->next_victim_seg[BG_GC] = NULL_SEGNO; sbi->next_victim_seg[FG_GC] = NULL_SEGNO; From ab6f84722537b653d6f3ce9834cab97e5845598b Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 7 Dec 2020 18:59:34 +0800 Subject: [PATCH 136/580] f2fs: don't check PAGE_SIZE again in sanity_check_raw_super() Many flash devices read and write a single IO based on a multiple of 4KB, and we support only 4KB page cache size now. Since we already check page size in init_f2fs_fs(), so remove page size check in sanity_check_raw_super(). 
Signed-off-by: Yangtao Li Signed-off-by: Shaohua Liu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 15674b2602ea..7af4d8a499bd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2768,13 +2768,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } } - /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_SIZE) { - f2fs_info(sbi, "Invalid page_cache_size (%lu), supports only 4KB", - PAGE_SIZE); - return -EFSCORRUPTED; - } - /* Currently, support only 4KB block size */ blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); if (blocksize != F2FS_BLKSIZE) { From 81dda15a3e1c35cdf96c3c237cfe1735da38fea9 Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Mon, 7 Dec 2020 20:01:12 +0800 Subject: [PATCH 137/580] f2fs: inline: correct comment in f2fs_recover_inline_data In 3rd scene, it should remove data blocks instead of inline_data. Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2f58d1c11dba..f2198fd484be 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -266,7 +266,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage) * [prev.] [next] of inline_data flag * o o -> recover inline_data * o x -> remove inline_data, and then recover data blocks - * x o -> remove inline_data, and then recover inline_data + * x o -> remove data blocks, and then recover inline_data * x x -> recover data blocks */ if (IS_INODE(npage)) From d3c883ee8acf5503d0ba236b38b72aa73354cac6 Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Mon, 7 Dec 2020 20:01:14 +0800 Subject: [PATCH 138/580] f2fs: inline: fix wrong inline inode stat Miss to stat inline inode in f2fs_recover_inline_data. Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index f2198fd484be..6cd739114c9d 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,6 +298,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage) if (IS_ERR(ipage)) return PTR_ERR(ipage); f2fs_truncate_inline_inode(inode, ipage, 0); + stat_dec_inline_inode(inode); clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { @@ -306,6 +307,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage) ret = f2fs_truncate_blocks(inode, 0, false); if (ret) return ret; + stat_inc_inline_inode(inode); goto process_inline; } return 0; From 8051912612f96912e3d778977d4df1cbf256e436 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Dec 2020 17:54:41 +0800 Subject: [PATCH 139/580] f2fs: fix to account inline xattr correctly during recovery During recovery, we may missed to update inline xattr count correctly, fix it. 
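The counting rule the fix enforces can be sketched in a few lines (a simplified userspace model, not the kernel code): only adjust the stat when the inode actually changes state, so running recovery over the same on-disk inode again cannot skew the counter.

#include <stdbool.h>
#include <stdio.h>

static unsigned int inline_xattr_inodes;

static void recover_inline_xattr(bool *has_inline_xattr, bool on_disk_flag)
{
	if (on_disk_flag && !*has_inline_xattr) {
		*has_inline_xattr = true;
		inline_xattr_inodes++;		/* stat_inc_inline_xattr() */
	} else if (!on_disk_flag && *has_inline_xattr) {
		*has_inline_xattr = false;
		inline_xattr_inodes--;		/* stat_dec_inline_xattr() */
	}
}

int main(void)
{
	bool cached = false;

	recover_inline_xattr(&cached, true);
	recover_inline_xattr(&cached, true);	/* replay: no double count */
	printf("inline xattr inodes: %u\n", inline_xattr_inodes);
	return 0;
}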
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e5f9ff84f0d5..8861838f49c9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2594,9 +2594,15 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) ri = F2FS_INODE(page); if (ri->i_inline & F2FS_INLINE_XATTR) { - set_inode_flag(inode, FI_INLINE_XATTR); + if (!f2fs_has_inline_xattr(inode)) { + set_inode_flag(inode, FI_INLINE_XATTR); + stat_inc_inline_xattr(inode); + } } else { - clear_inode_flag(inode, FI_INLINE_XATTR); + if (f2fs_has_inline_xattr(inode)) { + stat_dec_inline_xattr(inode); + clear_inode_flag(inode, FI_INLINE_XATTR); + } goto update_inode; } From 476fbc9cdbe21f77b9edebfd25744be479af756a Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Sat, 5 Dec 2020 13:26:26 +0900 Subject: [PATCH 140/580] f2fs: fix race of pending_pages in decompression I found out f2fs_free_dic() is invoked in a wrong timing, but f2fs_verify_bio() still needed the dic info and it triggered the below kernel panic. It has been caused by the race condition of pending_pages value between decompression and verity logic, when the same compression cluster had been split in different bios. By split bios, f2fs_verify_bio() ended up with decreasing pending_pages value before it is reset to nr_cpages by f2fs_decompress_pages() and caused the kernel panic. [ 4416.564763] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 ... [ 4416.896016] Workqueue: fsverity_read_queue f2fs_verity_work [ 4416.908515] pc : fsverity_verify_page+0x20/0x78 [ 4416.913721] lr : f2fs_verify_bio+0x11c/0x29c [ 4416.913722] sp : ffffffc019533cd0 [ 4416.913723] x29: ffffffc019533cd0 x28: 0000000000000402 [ 4416.913724] x27: 0000000000000001 x26: 0000000000000100 [ 4416.913726] x25: 0000000000000001 x24: 0000000000000004 [ 4416.913727] x23: 0000000000001000 x22: 0000000000000000 [ 4416.913728] x21: 0000000000000000 x20: ffffffff2076f9c0 [ 4416.913729] x19: ffffffff2076f9c0 x18: ffffff8a32380c30 [ 4416.913731] x17: ffffffc01f966d97 x16: 0000000000000298 [ 4416.913732] x15: 0000000000000000 x14: 0000000000000000 [ 4416.913733] x13: f074faec89ffffff x12: 0000000000000000 [ 4416.913734] x11: 0000000000001000 x10: 0000000000001000 [ 4416.929176] x9 : ffffffff20d1f5c7 x8 : 0000000000000000 [ 4416.929178] x7 : 626d7464ff286b6b x6 : ffffffc019533ade [ 4416.929179] x5 : 000000008049000e x4 : ffffffff2793e9e0 [ 4416.929180] x3 : 000000008049000e x2 : ffffff89ecfa74d0 [ 4416.929181] x1 : 0000000000000c40 x0 : ffffffff2076f9c0 [ 4416.929184] Call trace: [ 4416.929187] fsverity_verify_page+0x20/0x78 [ 4416.929189] f2fs_verify_bio+0x11c/0x29c [ 4416.929192] f2fs_verity_work+0x58/0x84 [ 4417.050667] process_one_work+0x270/0x47c [ 4417.055354] worker_thread+0x27c/0x4d8 [ 4417.059784] kthread+0x13c/0x320 [ 4417.063693] ret_from_fork+0x10/0x18 Chao pointed this can happen by the below race condition. 
Thread A f2fs_post_read_wq fsverity_wq - f2fs_read_multi_pages() - f2fs_alloc_dic - dic->pending_pages = 2 - submit_bio() - submit_bio() - f2fs_post_read_work() handle first bio - f2fs_decompress_work() - __read_end_io() - f2fs_decompress_pages() - dic->pending_pages-- - enqueue f2fs_verity_work() - f2fs_verity_work() handle first bio - f2fs_verify_bio() - dic->pending_pages-- - f2fs_post_read_work() handle second bio - f2fs_decompress_work() - enqueue f2fs_verity_work() - f2fs_verify_pages() - f2fs_free_dic() - f2fs_verity_work() handle second bio - f2fs_verfy_bio() - use-after-free on dic Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 -- fs/f2fs/data.c | 58 +++++++++++++++++++++++++++++++++++++--------- fs/f2fs/f2fs.h | 1 + 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index f456d972ae90..09b92c11cffb 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -792,8 +792,6 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); out_free_dic: - if (verity) - atomic_set(&dic->pending_pages, dic->nr_cpages); if (!verity) f2fs_decompress_end_io(dic->rpages, dic->cluster_size, ret, false); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 81577dd9e680..6f8d44a64303 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -200,7 +200,7 @@ static void f2fs_verify_bio(struct bio *bio) dic = (struct decompress_io_ctx *)page_private(page); if (dic) { - if (atomic_dec_return(&dic->pending_pages)) + if (atomic_dec_return(&dic->verity_pages)) continue; f2fs_verify_pages(dic->rpages, dic->cluster_size); @@ -984,7 +984,8 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx, bool for_write) + pgoff_t first_idx, bool for_write, + bool for_verity) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1003,7 +1004,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, post_read_steps |= 1 << STEP_DECRYPT; if (f2fs_compressed_file(inode)) post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; - if (f2fs_need_verity(inode, first_idx)) + if (for_verity && f2fs_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { @@ -1033,7 +1034,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, - page->index, for_write); + page->index, for_write, true); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -2070,7 +2071,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, - false); + false, true); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2116,6 +2117,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; + struct bio_post_read_ctx *ctx; + bool for_verity = false; int i; int ret = 0; @@ -2181,10 +2184,29 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } + /* + * It's possible to enable fsverity on the fly when handling a cluster, + * which requires complicated error handling. 
Instead of adding more + * complexity, let's give a rule where end_io post-processes fsverity + * per cluster. In order to do that, we need to submit bio, if previous + * bio sets a different post-process policy. + */ + if (fsverity_active(cc->inode)) { + atomic_set(&dic->verity_pages, cc->nr_cpages); + for_verity = true; + + if (bio) { + ctx = bio->bi_private; + if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { + __submit_bio(sbi, bio, DATA); + bio = NULL; + } + } + } + for (i = 0; i < dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; - struct bio_post_read_ctx *ctx; blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2199,17 +2221,31 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, - page->index, for_write); + page->index, for_write, for_verity); if (IS_ERR(bio)) { + unsigned int remained = dic->nr_cpages - i; + bool release = false; + ret = PTR_ERR(bio); dic->failed = true; - if (!atomic_sub_return(dic->nr_cpages - i, - &dic->pending_pages)) { + + if (for_verity) { + if (!atomic_sub_return(remained, + &dic->verity_pages)) + release = true; + } else { + if (!atomic_sub_return(remained, + &dic->pending_pages)) + release = true; + } + + if (release) { f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); + cc->cluster_size, true, + false); f2fs_free_dic(dic); } + f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1d52018101c8..322b51809a39 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1358,6 +1358,7 @@ struct decompress_io_ctx { size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ atomic_t pending_pages; /* in-flight compressed page count */ + atomic_t verity_pages; /* in-flight page count for verity */ bool failed; /* indicate IO error during decompression */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ From 0283c68c08146855c855f7efad3d552bef696b94 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 9 Dec 2020 16:49:36 +0800 Subject: [PATCH 141/580] f2fs: fix shift-out-of-bounds in sanity_check_raw_super() syzbot reported a bug which could cause shift-out-of-bounds issue, fix it. 
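A minimal userspace sketch of the issue and the fix is below (F2FS_BLKSIZE_BITS is assumed to be 12, and the crafted value is arbitrary); the syzbot call trace then follows.

#include <stdio.h>

#define F2FS_BLKSIZE_BITS	12	/* log2 of the only supported 4KB block size */

static int sanity_check_log_blocksize(unsigned int log_blocksize)
{
	/*
	 * The old code computed "1 << log_blocksize" first and compared the
	 * result against F2FS_BLKSIZE; with a crafted superblock the shift
	 * count can exceed the width of int, which is undefined behaviour
	 * (the shift-out-of-bounds UBSAN splat). Validating the exponent
	 * itself needs no shift at all.
	 */
	if (log_blocksize != F2FS_BLKSIZE_BITS)
		return -1;	/* stands in for -EFSCORRUPTED */
	return 0;
}

int main(void)
{
	printf("crafted superblock: %d\n", sanity_check_log_blocksize(0xdeadbeef));
	printf("valid superblock:   %d\n", sanity_check_log_blocksize(12));
	return 0;
}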
Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395 sanity_check_raw_super fs/f2fs/super.c:2812 [inline] read_raw_super_block fs/f2fs/super.c:3267 [inline] f2fs_fill_super.cold+0x16c9/0x16f6 fs/f2fs/super.c:3519 mount_bdev+0x34d/0x410 fs/super.c:1366 legacy_get_tree+0x105/0x220 fs/fs_context.c:592 vfs_get_tree+0x89/0x2f0 fs/super.c:1496 do_new_mount fs/namespace.c:2896 [inline] path_mount+0x12ae/0x1e70 fs/namespace.c:3227 do_mount fs/namespace.c:3240 [inline] __do_sys_mount fs/namespace.c:3448 [inline] __se_sys_mount fs/namespace.c:3425 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3425 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported-by: syzbot+ca9a785f8ac472085994@syzkaller.appspotmail.com Signed-off-by: Anant Thazhemadam Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7af4d8a499bd..b587d94f5e6d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2742,7 +2742,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); - unsigned int blocksize; size_t crc_offset = 0; __u32 crc = 0; @@ -2769,10 +2768,10 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } /* Currently, support only 4KB block size */ - blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != F2FS_BLKSIZE) { - f2fs_info(sbi, "Invalid blocksize (%u), supports only 4KB", - blocksize); + if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) { + f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u", + le32_to_cpu(raw_super->log_blocksize), + F2FS_BLKSIZE_BITS); return -EFSCORRUPTED; } From 010b090e4e64913748f16c28168be674a4139e14 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 9 Dec 2020 16:42:14 +0800 Subject: [PATCH 142/580] f2fs: compress: fix compression chksum This patch addresses minor issues in compression chksum. 
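One of the hunks below only adds parentheses; since "<<" binds tighter than "&" in C, the old and new expressions test the same bit, so that part appears to be a readability cleanup. A quick standalone check (the bit position is illustrative only):

#include <assert.h>

#define COMPRESS_CHKSUM	1	/* illustrative bit position only */

int main(void)
{
	unsigned int flag = 1 << COMPRESS_CHKSUM;

	/* "<<" has higher precedence than "&", so both forms are equivalent */
	assert((flag & 1 << COMPRESS_CHKSUM) ==
	       (flag & (1 << COMPRESS_CHKSUM)));
	return 0;
}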
Fixes: b28f047b28c5 ("f2fs: compress: support chksum") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 3 +-- fs/f2fs/compress.h | 0 2 files changed, 1 insertion(+), 2 deletions(-) create mode 100644 fs/f2fs/compress.h diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 09b92c11cffb..a4c6e9af4322 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -767,7 +767,7 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) ret = cops->decompress_pages(dic); - if (!ret && fi->i_compress_flag & 1 << COMPRESS_CHKSUM) { + if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) { u32 provided = le32_to_cpu(dic->cbuf->chksum); u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); @@ -780,7 +780,6 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) provided, calculated); } set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON_ONCE(1); } } diff --git a/fs/f2fs/compress.h b/fs/f2fs/compress.h new file mode 100644 index 000000000000..e69de29bb2d1 From 09ac2b60db1e857aa06633d980415b77b478b3c5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Dec 2020 11:44:25 -0800 Subject: [PATCH 143/580] f2fs: handle unallocated section and zone on pinned/atgc If we have large section/zone, unallocated segment makes them corrupted. E.g., - Pinned file: -1 119304647 119304647 - ATGC data: -1 119304647 119304647 Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e81eb0748e2a..229814b4f4a6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -101,11 +101,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) #define GET_SEC_FROM_SEG(sbi, segno) \ - ((segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ - ((secno) / (sbi)->secs_per_zone) + (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) From 1bc0d459811e15a2144227dc98f471120bc680ed Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Tue, 22 Dec 2020 21:34:15 +0800 Subject: [PATCH 144/580] f2fs: Replace expression with offsetof() Use the existing offsetof() macro instead of duplicating code. 
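A tiny standalone sketch (the struct layout is made up) showing that offsetof() yields the same byte count as the open-coded pointer subtraction it replaces:

#include <stddef.h>
#include <stdio.h>

struct example_inode {
	unsigned int	i_mode;
	unsigned long	i_size;
	unsigned char	i_ext[16];	/* copy everything before this field */
};

int main(void)
{
	struct example_inode src;
	size_t open_coded = (unsigned long)&src.i_ext - (unsigned long)&src;
	size_t with_macro = offsetof(struct example_inode, i_ext);

	printf("open coded: %zu, offsetof: %zu\n", open_coded, with_macro);
	return 0;
}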
Signed-off-by: Zheng Yongjun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8861838f49c9..5f561eb33d86 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2696,7 +2696,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) src = F2FS_INODE(page); dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); From 5f1583810b404e3bca274634b6969d1b813af219 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Mon, 14 Dec 2020 11:54:53 +0800 Subject: [PATCH 145/580] f2fs: fix to set inode->i_mode correctly for posix_acl_update_mode We should update the ~S_IRWXUGO part of inode->i_mode in __setattr_copy, because posix_acl_update_mode updates mode based on inode->i_mode, which finally overwrites the ~S_IRWXUGO part of i_acl_mode with old i_mode. Testcase to reproduce this bug: 0. adduser abc 1. mkfs.f2fs /dev/sdd 2. mount -t f2fs /dev/sdd /mnt/f2fs 3. mkdir /mnt/f2fs/test 4. setfacl -m u:abc:r /mnt/f2fs/test 5. chmod +s /mnt/f2fs/test Signed-off-by: Weichao Guo Signed-off-by: Bin Shu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84d4043103ce..9e2d604d5760 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -872,6 +872,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; + inode->i_mode = (inode->i_mode & S_IRWXUGO) | (mode & ~S_IRWXUGO); set_acl_inode(inode, mode); } } From 46000d2be7209e2f732757c0faaa22262f15fa8c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 25 Dec 2020 16:52:27 +0800 Subject: [PATCH 146/580] f2fs: enhance to update i_mode and acl atomically in f2fs_setattr() Previously, in f2fs_setattr(), we don't update S_ISUID|S_ISGID|S_ISVTX bits with S_IRWXUGO bits and acl entries atomically, so in error path, chmod() may partially success, this patch enhances to make chmod() flow being atomical. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 23 ++++++++++++++++++++++- fs/f2fs/file.c | 7 ++++--- fs/f2fs/xattr.c | 15 +++++++++------ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 1e5e9b1136ee..732ec10e7890 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -200,6 +200,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return __f2fs_get_acl(inode, type, NULL); } +static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) + mode = F2FS_I(inode)->i_acl_mode; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} + static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -213,7 +234,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = f2fs_acl_update_mode(inode, &mode, &acl); if (error) return error; set_acl_inode(inode, mode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9e2d604d5760..c2753e2656ff 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -872,7 +872,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - inode->i_mode = (inode->i_mode & S_IRWXUGO) | (mode & ~S_IRWXUGO); set_acl_inode(inode, mode); } } @@ -972,8 +971,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); - if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + if (!err) + inode->i_mode = F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); } } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 65afcc3cc68a..2086bef6c154 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -673,7 +673,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, } if (value && f2fs_xattr_value_same(here, value, size)) - goto exit; + goto same; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; @@ -738,17 +738,20 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); - clear_inode_flag(inode, FI_ACL_MODE); - } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); + +same: + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + inode->i_ctime = current_time(inode); + clear_inode_flag(inode, FI_ACL_MODE); + } + exit: kfree(base_addr); return error; From a5cf21779ae2cfca9b9fe389f4a93f1f15983c42 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Dec 2020 18:07:01 +0800 Subject: [PATCH 147/580] f2fs: enforce the immutable flag on open files This patch ports commit 02b016ca7f99 ("ext4: enforce the 
immutable flag on open files") to f2fs. According to the chattr man page, "a file with the 'i' attribute cannot be modified..." Historically, this was only enforced when the file was opened, per the rest of the description, "... and the file can not be opened in write mode". There is general agreement that we should standardize all file systems to prevent modifications even for files that were opened at the time the immutable flag is set. Eventually, a change to enforce this at the VFS layer should be landing in mainline. Cc: stable@kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c2753e2656ff..09507d7c4c00 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -60,6 +60,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) bool need_alloc = true; int err = 0; + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -887,6 +890,14 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (attr->ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + if ((attr->ia_valid & ATTR_SIZE) && !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; @@ -4374,6 +4385,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } + if (unlikely(IS_IMMUTABLE(inode))) { + ret = -EPERM; + goto unlock; + } + ret = generic_write_checks(iocb, from); if (ret > 0) { bool preallocated = false; @@ -4438,6 +4454,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } +unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, iocb->ki_pos, From 565a750b34cc7106cc2f9bf39c5ec06f1a3f3c05 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Dec 2020 18:07:41 +0800 Subject: [PATCH 148/580] f2fs: relocate f2fs_precache_extents() Relocate f2fs_precache_extents() in prior to check_swap_activate(), then extent cache can be enabled before its use. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6f8d44a64303..9b1b76de66b3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4056,12 +4056,13 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!f2fs_disable_compressed_file(inode)) return -EINVAL; + f2fs_precache_extents(inode); + ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; set_inode_flag(inode, FI_PIN_FILE); - f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } From 945a315ae739c7077aff28c54586cbc7068f1ee6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Jan 2021 17:40:13 +0800 Subject: [PATCH 149/580] f2fs: compress: deny setting unsupported compress algorithm If kernel doesn't support certain kinds of compress algorithm, deny to set them as compress algorithm of f2fs via 'compress_algorithm=%s' mount option. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b587d94f5e6d..e0c4d303ec0f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -872,14 +872,26 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (!name) return -ENOMEM; if (!strcmp(name, "lzo")) { +#ifdef CONFIG_F2FS_FS_LZO F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; +#else + f2fs_info(sbi, "kernel doesn't support lzo compression"); +#endif } else if (!strcmp(name, "lz4")) { +#ifdef CONFIG_F2FS_FS_LZ4 F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; +#else + f2fs_info(sbi, "kernel doesn't support lz4 compression"); +#endif } else if (!strcmp(name, "zstd")) { +#ifdef CONFIG_F2FS_FS_ZSTD F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; +#else + f2fs_info(sbi, "kernel doesn't support zstd compression"); +#endif } else { kfree(name); return -EINVAL; From bc20f67bfa05b0bf6ad6949a576f1dd789c2cfa5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Jan 2021 17:46:43 +0800 Subject: [PATCH 150/580] f2fs: compress: support compress level Expand 'compress_algorithm' mount option to accept parameter as format of :, by this way, it gives a way to allow user to do more specified config on lz4 and zstd compression level, then f2fs compression can provide higher compress ratio. In order to set compress level for lz4 algorithm, it needs to set CONFIG_LZ4HC_COMPRESS and CONFIG_F2FS_FS_LZ4HC config to enable lz4hc compress algorithm. CR and performance number on lz4/lz4hc algorithm: dd if=enwik9 of=compressed_file conv=fsync Original blocks: 244382 lz4 lz4hc-9 compressed blocks 170647 163270 compress ratio 69.8% 66.8% speed 16.4207 s, 60.9 MB/s 26.7299 s, 37.4 MB/s compress ratio = after / before Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 867 +++++++++++++++++++++++++++++ fs/f2fs/Kconfig | 10 + fs/f2fs/compress.c | 41 +- fs/f2fs/f2fs.h | 9 + fs/f2fs/super.c | 88 ++- include/linux/f2fs_fs.h | 3 + 6 files changed, 1013 insertions(+), 5 deletions(-) create mode 100644 Documentation/filesystems/f2fs.rst diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst new file mode 100644 index 000000000000..5eff4009e77e --- /dev/null +++ b/Documentation/filesystems/f2fs.rst @@ -0,0 +1,867 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================== +WHAT IS Flash-Friendly File System (F2FS)? +========================================== + +NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have +been equipped on a variety systems ranging from mobile to server systems. Since +they are known to have different characteristics from the conventional rotating +disks, a file system, an upper layer to the storage device, should adapt to the +changes from the sketch in the design level. + +F2FS is a file system exploiting NAND flash memory-based storage devices, which +is based on Log-structured File System (LFS). The design has been focused on +addressing the fundamental issues in LFS, which are snowball effect of wandering +tree and high cleaning overhead. + +Since a NAND flash memory-based storage device shows different characteristic +according to its internal geometry or flash memory management scheme, namely FTL, +F2FS and its tools support various parameters not only for configuring on-disk +layout, but also for selecting allocation and cleaning algorithms. 
+ +The following git tree provides the file system formatting tool (mkfs.f2fs), +a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs). + +- git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git + +For reporting bugs and sending patches, please use the following mailing list: + +- linux-f2fs-devel@lists.sourceforge.net + +Background and Design issues +============================ + +Log-structured File System (LFS) +-------------------------------- +"A log-structured file system writes all modifications to disk sequentially in +a log-like structure, thereby speeding up both file writing and crash recovery. +The log is the only structure on disk; it contains indexing information so that +files can be read back from the log efficiently. In order to maintain large free +areas on disk for fast writing, we divide the log into segments and use a +segment cleaner to compress the live information from heavily fragmented +segments." from Rosenblum, M. and Ousterhout, J. K., 1992, "The design and +implementation of a log-structured file system", ACM Trans. Computer Systems +10, 1, 26–52. + +Wandering Tree Problem +---------------------- +In LFS, when a file data is updated and written to the end of log, its direct +pointer block is updated due to the changed location. Then the indirect pointer +block is also updated due to the direct pointer block update. In this manner, +the upper index structures such as inode, inode map, and checkpoint block are +also updated recursively. This problem is called as wandering tree problem [1], +and in order to enhance the performance, it should eliminate or relax the update +propagation as much as possible. + +[1] Bityutskiy, A. 2005. JFFS3 design issues. http://www.linux-mtd.infradead.org/ + +Cleaning Overhead +----------------- +Since LFS is based on out-of-place writes, it produces so many obsolete blocks +scattered across the whole storage. In order to serve new empty log space, it +needs to reclaim these obsolete blocks seamlessly to users. This job is called +as a cleaning process. + +The process consists of three operations as follows. + +1. A victim segment is selected through referencing segment usage table. +2. It loads parent index structures of all the data in the victim identified by + segment summary blocks. +3. It checks the cross-reference between the data and its parent index structure. +4. It moves valid data selectively. + +This cleaning job may cause unexpected long delays, so the most important goal +is to hide the latencies to users. And also definitely, it should reduce the +amount of valid data to be moved, and move them quickly as well. + +Key Features +============ + +Flash Awareness +--------------- +- Enlarge the random write area for better performance, but provide the high + spatial locality +- Align FS data structures to the operational units in FTL as best efforts + +Wandering Tree Problem +---------------------- +- Use a term, “node”, that represents inodes as well as various pointer blocks +- Introduce Node Address Table (NAT) containing the locations of all the “node” + blocks; this will cut off the update propagation. 
+ +Cleaning Overhead +----------------- +- Support a background cleaning process +- Support greedy and cost-benefit algorithms for victim selection policies +- Support multi-head logs for static/dynamic hot and cold data separation +- Introduce adaptive logging for efficient block allocation + +Mount Options +============= + + +======================== ============================================================ +background_gc=%s Turn on/off cleaning operations, namely garbage + collection, triggered in background when I/O subsystem is + idle. If background_gc=on, it will turn on the garbage + collection and if background_gc=off, garbage collection + will be turned off. If background_gc=sync, it will turn + on synchronous garbage collection running in background. + Default value for this option is on. So garbage + collection is on by default. +disable_roll_forward Disable the roll-forward recovery routine +norecovery Disable the roll-forward recovery routine, mounted read- + only (i.e., -o ro,disable_roll_forward) +discard/nodiscard Enable/disable real-time discard in f2fs, if discard is + enabled, f2fs will issue discard/TRIM commands when a + segment is cleaned. +no_heap Disable heap-style segment allocation which finds free + segments for data from the beginning of main area, while + for node from the end of main area. +nouser_xattr Disable Extended User Attributes. Note: xattr is enabled + by default if CONFIG_F2FS_FS_XATTR is selected. +noacl Disable POSIX Access Control List. Note: acl is enabled + by default if CONFIG_F2FS_FS_POSIX_ACL is selected. +active_logs=%u Support configuring the number of active logs. In the + current design, f2fs supports only 2, 4, and 6 logs. + Default number is 6. +disable_ext_identify Disable the extension list configured by mkfs, so f2fs + is not aware of cold files such as media files. +inline_xattr Enable the inline xattrs feature. +noinline_xattr Disable the inline xattrs feature. +inline_xattr_size=%u Support configuring inline xattr size, it depends on + flexible inline xattr feature. +inline_data Enable the inline data feature: Newly created small (<~3.4k) + files can be written into inode block. +inline_dentry Enable the inline dir feature: data in newly created + directory entries can be written into inode block. The + space of inode block which is used to store inline + dentries is limited to ~3.4k. +noinline_dentry Disable the inline dentry feature. +flush_merge Merge concurrent cache_flush commands as much as possible + to eliminate redundant command issues. If the underlying + device handles the cache_flush command relatively slowly, + recommend to enable this option. +nobarrier This option can be used if underlying storage guarantees + its cached data should be written to the novolatile area. + If this option is set, no cache_flush commands are issued + but f2fs still guarantees the write ordering of all the + data writes. +fastboot This option is used when a system wants to reduce mount + time as much as possible, even though normal performance + can be sacrificed. +extent_cache Enable an extent cache based on rb-tree, it can cache + as many as extent which map between contiguous logical + address and physical address per inode, resulting in + increasing the cache hit ratio. Set by default. +noextent_cache Disable an extent cache based on rb-tree explicitly, see + the above extent_cache mount option. +noinline_data Disable the inline data feature, inline data feature is + enabled by default. 
+data_flush Enable data flushing before checkpoint in order to + persist data of regular and symlink. +reserve_root=%d Support configuring reserved space which is used for + allocation from a privileged user with specified uid or + gid, unit: 4KB, the default limit is 0.2% of user blocks. +resuid=%d The user ID which may use the reserved blocks. +resgid=%d The group ID which may use the reserved blocks. +fault_injection=%d Enable fault injection in all supported types with + specified injection rate. +fault_type=%d Support configuring fault injection type, should be + enabled with fault_injection option, fault type value + is shown below, it supports single or combined type. + + =================== =========== + Type_Name Type_Value + =================== =========== + FAULT_KMALLOC 0x000000001 + FAULT_KVMALLOC 0x000000002 + FAULT_PAGE_ALLOC 0x000000004 + FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 + FAULT_ALLOC_NID 0x000000020 + FAULT_ORPHAN 0x000000040 + FAULT_BLOCK 0x000000080 + FAULT_DIR_DEPTH 0x000000100 + FAULT_EVICT_INODE 0x000000200 + FAULT_TRUNCATE 0x000000400 + FAULT_READ_IO 0x000000800 + FAULT_CHECKPOINT 0x000001000 + FAULT_DISCARD 0x000002000 + FAULT_WRITE_IO 0x000004000 + =================== =========== +mode=%s Control block allocation mode which supports "adaptive" + and "lfs". In "lfs" mode, there should be no random + writes towards main area. +io_bits=%u Set the bit size of write IO requests. It should be set + with "mode=lfs". +usrquota Enable plain user disk quota accounting. +grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting. +usrjquota= Appoint specified file and type during mount, so that quota +grpjquota= information can be properly updated during recovery flow, +prjjquota= : must be in root directory; +jqfmt= : [vfsold,vfsv0,vfsv1]. +offusrjquota Turn off user journalled quota. +offgrpjquota Turn off group journalled quota. +offprjjquota Turn off project journalled quota. +quota Enable plain user disk quota accounting. +noquota Disable all plain disk quota option. +whint_mode=%s Control which write hints are passed down to block + layer. This supports "off", "user-based", and + "fs-based". In "off" mode (default), f2fs does not pass + down hints. In "user-based" mode, f2fs tries to pass + down hints given by users. And in "fs-based" mode, f2fs + passes down hints with its policy. +alloc_mode=%s Adjust block allocation policy, which supports "reuse" + and "default". +fsync_mode=%s Control the policy of fsync. Currently supports "posix", + "strict", and "nobarrier". In "posix" mode, which is + default, fsync will follow POSIX semantics and does a + light operation to improve the filesystem performance. + In "strict" mode, fsync will be heavy and behaves in line + with xfs, ext4 and btrfs, where xfstest generic/342 will + pass, but the performance will regress. "nobarrier" is + based on "posix", but doesn't issue flush command for + non-atomic files likewise "nobarrier" mount option. +test_dummy_encryption +test_dummy_encryption=%s + Enable dummy encryption, which provides a fake fscrypt + context. The fake fscrypt context is used by xfstests. + The argument may be either "v1" or "v2", in order to + select the corresponding fscrypt policy version. +checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable" + to reenable checkpointing. Is enabled by default. 
While + disabled, any unmounting or unexpected shutdowns will cause + the filesystem contents to appear as they did when the + filesystem was mounted with that option. + While mounting with checkpoint=disabled, the filesystem must + run garbage collection to ensure that all available space can + be used. If this takes too much time, the mount may return + EAGAIN. You may optionally add a value to indicate how much + of the disk you would be willing to temporarily give up to + avoid additional garbage collection. This can be given as a + number of blocks, or as a percent. For instance, mounting + with checkpoint=disable:100% would always succeed, but it may + hide up to all remaining free space. The actual space that + would be unusable can be viewed at /sys/fs/f2fs//unusable + This space is reclaimed once checkpoint=enable. +compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", + "lz4", "zstd" and "lzo-rle" algorithm. +compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only + "lz4" and "zstd" support compress level config. + algorithm level range + lz4 3 - 16 + zstd 1 - 22 +compress_log_size=%u Support configuring compress cluster size, the size will + be 4KB * (1 << %u), 16KB is minimum size, also it's + default size. +compress_extension=%s Support adding specified extension, so that f2fs can enable + compression on those corresponding files, e.g. if all files + with '.ext' has high compression rate, we can set the '.ext' + on compression extension list and enable compression on + these file by default rather than to enable it via ioctl. + For other files, we can still enable compression via ioctl. + Note that, there is one reserved special extension '*', it + can be set to enable compression for all files. +compress_chksum Support verifying chksum of raw data in compressed cluster. +compress_mode=%s Control file compression mode. This supports "fs" and "user" + modes. In "fs" mode (default), f2fs does automatic compression + on the compression enabled files. In "user" mode, f2fs disables + the automaic compression and gives the user discretion of + choosing the target file and the timing. The user can do manual + compression/decompression on the compression enabled files using + ioctls. +inlinecrypt When possible, encrypt/decrypt the contents of encrypted + files using the blk-crypto framework rather than + filesystem-layer encryption. This allows the use of + inline encryption hardware. The on-disk format is + unaffected. For more details, see + Documentation/block/inline-encryption.rst. +atgc Enable age-threshold garbage collection, it provides high + effectiveness and efficiency on background GC. +======================== ============================================================ + +Debugfs Entries +=============== + +/sys/kernel/debug/f2fs/ contains information about all the partitions mounted as +f2fs. Each file shows the whole f2fs information. + +/sys/kernel/debug/f2fs/status includes: + + - major file system information managed by f2fs currently + - average SIT information about whole segments + - current memory footprint consumed by f2fs. + +Sysfs Entries +============= + +Information about mounted f2fs file systems can be found in +/sys/fs/f2fs. Each mounted filesystem will have a directory in +/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda). +The files in each per-device directory are shown in table below. + +Files in /sys/fs/f2fs/ +(see also Documentation/ABI/testing/sysfs-fs-f2fs) + +Usage +===== + +1. 
Download userland tools and compile them. + +2. Skip, if f2fs was compiled statically inside kernel. + Otherwise, insert the f2fs.ko module:: + + # insmod f2fs.ko + +3. Create a directory to use when mounting:: + + # mkdir /mnt/f2fs + +4. Format the block device, and then mount as f2fs:: + + # mkfs.f2fs -l label /dev/block_device + # mount -t f2fs /dev/block_device /mnt/f2fs + +mkfs.f2fs +--------- +The mkfs.f2fs is for the use of formatting a partition as the f2fs filesystem, +which builds a basic on-disk layout. + +The quick options consist of: + +=============== =========================================================== +``-l [label]`` Give a volume label, up to 512 unicode name. +``-a [0 or 1]`` Split start location of each area for heap-based allocation. + + 1 is set by default, which performs this. +``-o [int]`` Set overprovision ratio in percent over volume size. + + 5 is set by default. +``-s [int]`` Set the number of segments per section. + + 1 is set by default. +``-z [int]`` Set the number of sections per zone. + + 1 is set by default. +``-e [str]`` Set basic extension list. e.g. "mp3,gif,mov" +``-t [0 or 1]`` Disable discard command or not. + + 1 is set by default, which conducts discard. +=============== =========================================================== + +Note: please refer to the manpage of mkfs.f2fs(8) to get full option list. + +fsck.f2fs +--------- +The fsck.f2fs is a tool to check the consistency of an f2fs-formatted +partition, which examines whether the filesystem metadata and user-made data +are cross-referenced correctly or not. +Note that, initial version of the tool does not fix any inconsistency. + +The quick options consist of:: + + -d debug level [default:0] + +Note: please refer to the manpage of fsck.f2fs(8) to get full option list. + +dump.f2fs +--------- +The dump.f2fs shows the information of specific inode and dumps SSA and SIT to +file. Each file is dump_ssa and dump_sit. + +The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem. +It shows on-disk inode information recognized by a given inode number, and is +able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and +./dump_sit respectively. + +The options consist of:: + + -d debug level [default:0] + -i inode no (hex) + -s [SIT dump segno from #1~#2 (decimal), for all 0~-1] + -a [SSA dump segno from #1~#2 (decimal), for all 0~-1] + +Examples:: + + # dump.f2fs -i [ino] /dev/sdx + # dump.f2fs -s 0~-1 /dev/sdx (SIT dump) + # dump.f2fs -a 0~-1 /dev/sdx (SSA dump) + +Note: please refer to the manpage of dump.f2fs(8) to get full option list. + +sload.f2fs +---------- +The sload.f2fs gives a way to insert files and directories in the exisiting disk +image. This tool is useful when building f2fs images given compiled files. + +Note: please refer to the manpage of sload.f2fs(8) to get full option list. + +resize.f2fs +----------- +The resize.f2fs lets a user resize the f2fs-formatted disk image, while preserving +all the files and directories stored in the image. + +Note: please refer to the manpage of resize.f2fs(8) to get full option list. + +defrag.f2fs +----------- +The defrag.f2fs can be used to defragment scattered written data as well as +filesystem metadata across the disk. This can improve the write speed by giving +more free consecutive space. + +Note: please refer to the manpage of defrag.f2fs(8) to get full option list. 
+ +f2fs_io +------- +The f2fs_io is a simple tool to issue various filesystem APIs as well as +f2fs-specific ones, which is very useful for QA tests. + +Note: please refer to the manpage of f2fs_io(8) to get full option list. + +Design +====== + +On-disk Layout +-------------- + +F2FS divides the whole volume into a number of segments, each of which is fixed +to 2MB in size. A section is composed of consecutive segments, and a zone +consists of a set of sections. By default, section and zone sizes are set to one +segment size identically, but users can easily modify the sizes by mkfs. + +F2FS splits the entire volume into six areas, and all the areas except superblock +consist of multiple segments as described below:: + + align with the zone size <-| + |-> align with the segment size + _________________________________________________________________________ + | | | Segment | Node | Segment | | + | Superblock | Checkpoint | Info. | Address | Summary | Main | + | (SB) | (CP) | Table (SIT) | Table (NAT) | Area (SSA) | | + |____________|_____2______|______N______|______N______|______N_____|__N___| + . . + . . + . . + ._________________________________________. + |_Segment_|_..._|_Segment_|_..._|_Segment_| + . . + ._________._________ + |_section_|__...__|_ + . . + .________. + |__zone__| + +- Superblock (SB) + It is located at the beginning of the partition, and there exist two copies + to avoid file system crash. It contains basic partition information and some + default parameters of f2fs. + +- Checkpoint (CP) + It contains file system information, bitmaps for valid NAT/SIT sets, orphan + inode lists, and summary entries of current active segments. + +- Segment Information Table (SIT) + It contains segment information such as valid block count and bitmap for the + validity of all the blocks. + +- Node Address Table (NAT) + It is composed of a block address table for all the node blocks stored in + Main area. + +- Segment Summary Area (SSA) + It contains summary entries which contains the owner information of all the + data and node blocks stored in Main area. + +- Main Area + It contains file and directory data including their indices. + +In order to avoid misalignment between file system and flash-based storage, F2FS +aligns the start block address of CP with the segment size. Also, it aligns the +start block address of Main area with the zone size by reserving some segments +in SSA area. + +Reference the following survey for additional technical details. +https://wiki.linaro.org/WorkingGroups/Kernel/Projects/FlashCardSurvey + +File System Metadata Structure +------------------------------ + +F2FS adopts the checkpointing scheme to maintain file system consistency. At +mount time, F2FS first tries to find the last valid checkpoint data by scanning +CP area. In order to reduce the scanning time, F2FS uses only two copies of CP. +One of them always indicates the last valid data, which is called as shadow copy +mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism. + +For file system consistency, each CP points to which NAT and SIT copies are +valid, as shown as below:: + + +--------+----------+---------+ + | CP | SIT | NAT | + +--------+----------+---------+ + . . . . + . . . . + . . . . 
+ +-------+-------+--------+--------+--------+--------+ + | CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 | + +-------+-------+--------+--------+--------+--------+ + | ^ ^ + | | | + `----------------------------------------' + +Index Structure +--------------- + +The key data structure to manage the data locations is a "node". Similar to +traditional file structures, F2FS has three types of node: inode, direct node, +indirect node. F2FS assigns 4KB to an inode block which contains 923 data block +indices, two direct node pointers, two indirect node pointers, and one double +indirect node pointer as described below. One direct node block contains 1018 +data blocks, and one indirect node block contains also 1018 node blocks. Thus, +one inode block (i.e., a file) covers:: + + 4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB. + + Inode block (4KB) + |- data (923) + |- direct node (2) + | `- data (1018) + |- indirect node (2) + | `- direct node (1018) + | `- data (1018) + `- double indirect node (1) + `- indirect node (1018) + `- direct node (1018) + `- data (1018) + +Note that all the node blocks are mapped by NAT which means the location of +each node is translated by the NAT table. In the consideration of the wandering +tree problem, F2FS is able to cut off the propagation of node updates caused by +leaf data writes. + +Directory Structure +------------------- + +A directory entry occupies 11 bytes, which consists of the following attributes. + +- hash hash value of the file name +- ino inode number +- len the length of file name +- type file type such as directory, symlink, etc + +A dentry block consists of 214 dentry slots and file names. Therein a bitmap is +used to represent whether each dentry is valid or not. A dentry block occupies +4KB with the following composition. + +:: + + Dentry Block(4 K) = bitmap (27 bytes) + reserved (3 bytes) + + dentries(11 * 214 bytes) + file name (8 * 214 bytes) + + [Bucket] + +--------------------------------+ + |dentry block 1 | dentry block 2 | + +--------------------------------+ + . . + . . + . [Dentry Block Structure: 4KB] . + +--------+----------+----------+------------+ + | bitmap | reserved | dentries | file names | + +--------+----------+----------+------------+ + [Dentry Block: 4KB] . . + . . + . . + +------+------+-----+------+ + | hash | ino | len | type | + +------+------+-----+------+ + [Dentry Structure: 11 bytes] + +F2FS implements multi-level hash tables for directory structure. Each level has +a hash table with dedicated number of hash buckets as shown below. Note that +"A(2B)" means a bucket includes 2 data blocks. + +:: + + ---------------------- + A : bucket + B : block + N : MAX_DIR_HASH_DEPTH + ---------------------- + + level #0 | A(2B) + | + level #1 | A(2B) - A(2B) + | + level #2 | A(2B) - A(2B) - A(2B) - A(2B) + . | . . . . + level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B) + . | . . . . + level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B) + +The number of blocks and buckets are determined by:: + + ,- 2, if n < MAX_DIR_HASH_DEPTH / 2, + # of blocks in level #n = | + `- 4, Otherwise + + ,- 2^(n + dir_level), + | if n + dir_level < MAX_DIR_HASH_DEPTH / 2, + # of buckets in level #n = | + `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1), + Otherwise + +When F2FS finds a file name in a directory, at first a hash value of the file +name is calculated. Then, F2FS scans the hash table in level #0 to find the +dentry consisting of the file name and its inode number. 
If not found, F2FS +scans the next hash table in level #1. In this way, F2FS scans hash tables in +each levels incrementally from 1 to N. In each level F2FS needs to scan only +one bucket determined by the following equation, which shows O(log(# of files)) +complexity:: + + bucket number to scan in level #n = (hash value) % (# of buckets in level #n) + +In the case of file creation, F2FS finds empty consecutive slots that cover the +file name. F2FS searches the empty slots in the hash tables of whole levels from +1 to N in the same way as the lookup operation. + +The following figure shows an example of two cases holding children:: + + --------------> Dir <-------------- + | | + child child + + child - child [hole] - child + + child - child - child [hole] - [hole] - child + + Case 1: Case 2: + Number of children = 6, Number of children = 3, + File size = 7 File size = 7 + +Default Block Allocation +------------------------ + +At runtime, F2FS manages six active logs inside "Main" area: Hot/Warm/Cold node +and Hot/Warm/Cold data. + +- Hot node contains direct node blocks of directories. +- Warm node contains direct node blocks except hot node blocks. +- Cold node contains indirect node blocks +- Hot data contains dentry blocks +- Warm data contains data blocks except hot and cold data blocks +- Cold data contains multimedia data or migrated data blocks + +LFS has two schemes for free space management: threaded log and copy-and-compac- +tion. The copy-and-compaction scheme which is known as cleaning, is well-suited +for devices showing very good sequential write performance, since free segments +are served all the time for writing new data. However, it suffers from cleaning +overhead under high utilization. Contrarily, the threaded log scheme suffers +from random writes, but no cleaning process is needed. F2FS adopts a hybrid +scheme where the copy-and-compaction scheme is adopted by default, but the +policy is dynamically changed to the threaded log scheme according to the file +system status. + +In order to align F2FS with underlying flash-based storage, F2FS allocates a +segment in a unit of section. F2FS expects that the section size would be the +same as the unit size of garbage collection in FTL. Furthermore, with respect +to the mapping granularity in FTL, F2FS allocates each section of the active +logs from different zones as much as possible, since FTL can write the data in +the active logs into one allocation unit according to its mapping granularity. + +Cleaning process +---------------- + +F2FS does cleaning both on demand and in the background. On-demand cleaning is +triggered when there are not enough free segments to serve VFS calls. Background +cleaner is operated by a kernel thread, and triggers the cleaning job when the +system is idle. + +F2FS supports two victim selection policies: greedy and cost-benefit algorithms. +In the greedy algorithm, F2FS selects a victim segment having the smallest number +of valid blocks. In the cost-benefit algorithm, F2FS selects a victim segment +according to the segment age and the number of valid blocks in order to address +log block thrashing problem in the greedy algorithm. F2FS adopts the greedy +algorithm for on-demand cleaner, while background cleaner adopts cost-benefit +algorithm. + +In order to identify whether the data in the victim segment are valid or not, +F2FS manages a bitmap. Each bit represents the validity of a block, and the +bitmap is composed of a bit stream covering whole blocks in main area. 
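As a rough illustration of the two victim selection policies described above (this is
not the in-kernel code), the following userspace sketch scores a few hypothetical
segments with both the greedy criterion and the classical LFS cost-benefit formula,
age * (1 - u) / (2 * u), where u is the segment utilization::

    /*
     * Illustrative sketch only -- not the f2fs implementation.  It compares:
     *   - greedy:       pick the segment with the fewest valid blocks
     *   - cost-benefit: weigh reclaimable space against segment age
     * The segment values below are made up for demonstration.
     */
    #include <stdio.h>

    struct seg {
            unsigned int valid_blocks;      /* out of blocks_per_seg */
            unsigned long long age;         /* e.g. now - last modified time */
    };

    static const unsigned int blocks_per_seg = 512;  /* 2MB segment / 4KB blocks */

    static double cost_benefit(const struct seg *s)
    {
            double u = (double)s->valid_blocks / blocks_per_seg;

            if (u == 0.0)
                    return 1e9;     /* already empty: ideal victim */
            return (double)s->age * (1.0 - u) / (2.0 * u);
    }

    int main(void)
    {
            struct seg segs[] = {
                    { .valid_blocks = 100, .age = 1000 },
                    { .valid_blocks = 400, .age = 50000 },
                    { .valid_blocks = 350, .age = 200 },
            };
            int greedy = 0, cb = 0;

            for (int i = 1; i < 3; i++) {
                    if (segs[i].valid_blocks < segs[greedy].valid_blocks)
                            greedy = i;
                    if (cost_benefit(&segs[i]) > cost_benefit(&segs[cb]))
                            cb = i;
            }
            printf("greedy victim: segment %d, cost-benefit victim: segment %d\n",
                   greedy, cb);
            return 0;
    }

Note how the cost-benefit score can prefer an older segment even though it holds
more valid blocks, which is the thrashing-avoidance behavior the background
cleaner is after.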
+ +Write-hint Policy +----------------- + +1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. + +2) whint_mode=user-based. F2FS tries to pass down hints given by +users. + +===================== ======================== =================== +User F2FS Block +===================== ======================== =================== + META WRITE_LIFE_NOT_SET + HOT_NODE " + WARM_NODE " + COLD_NODE " +ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME +extension list " " + +-- buffered io +WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME +WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT +WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET +WRITE_LIFE_NONE " " +WRITE_LIFE_MEDIUM " " +WRITE_LIFE_LONG " " + +-- direct io +WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME +WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT +WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET +WRITE_LIFE_NONE " WRITE_LIFE_NONE +WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM +WRITE_LIFE_LONG " WRITE_LIFE_LONG +===================== ======================== =================== + +3) whint_mode=fs-based. F2FS passes down hints with its policy. + +===================== ======================== =================== +User F2FS Block +===================== ======================== =================== + META WRITE_LIFE_MEDIUM; + HOT_NODE WRITE_LIFE_NOT_SET + WARM_NODE " + COLD_NODE WRITE_LIFE_NONE +ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME +extension list " " + +-- buffered io +WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME +WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT +WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG +WRITE_LIFE_NONE " " +WRITE_LIFE_MEDIUM " " +WRITE_LIFE_LONG " " + +-- direct io +WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME +WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT +WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET +WRITE_LIFE_NONE " WRITE_LIFE_NONE +WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM +WRITE_LIFE_LONG " WRITE_LIFE_LONG +===================== ======================== =================== + +Fallocate(2) Policy +------------------- + +The default policy follows the below POSIX rule. + +Allocating disk space + The default operation (i.e., mode is zero) of fallocate() allocates + the disk space within the range specified by offset and len. The + file size (as reported by stat(2)) will be changed if offset+len is + greater than the file size. Any subregion within the range specified + by offset and len that did not contain data before the call will be + initialized to zero. This default behavior closely resembles the + behavior of the posix_fallocate(3) library function, and is intended + as a method of optimally implementing that function. + +However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to +fallocate(fd, DEFAULT_MODE), it allocates on-disk block addressess having +zero or random data, which is useful to the below scenario where: + + 1. create(fd) + 2. ioctl(fd, F2FS_IOC_SET_PIN_FILE) + 3. fallocate(fd, 0, 0, size) + 4. address = fibmap(fd, offset) + 5. open(blkdev) + 6. write(blkdev, address) + +Compression implementation +-------------------------- + +- New term named cluster is defined as basic unit of compression, file can + be divided into multiple clusters logically. One cluster includes 4 << n + (n >= 0) logical pages, compression size is also cluster size, each of + cluster can be compressed or not. 
+ +- In cluster metadata layout, one special block address is used to indicate + a cluster is a compressed one or normal one; for compressed cluster, following + metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs + stores data including compress header and compressed data. + +- In order to eliminate write amplification during overwrite, F2FS only + support compression on write-once file, data can be compressed only when + all logical blocks in cluster contain valid data and compress ratio of + cluster data is lower than specified threshold. + +- To enable compression on regular inode, there are three ways: + + * chattr +c file + * chattr +c dir; touch dir/file + * mount w/ -o compress_extension=ext; touch file.ext + +Compress metadata layout:: + + [Dnode Structure] + +-----------------------------------------------+ + | cluster 1 | cluster 2 | ......... | cluster N | + +-----------------------------------------------+ + . . . . + . . . . + . Compressed Cluster . . Normal Cluster . + +----------+---------+---------+---------+ +---------+---------+---------+---------+ + |compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 | + +----------+---------+---------+---------+ +---------+---------+---------+---------+ + . . + . . + . . + +-------------+-------------+----------+----------------------------+ + | data length | data chksum | reserved | compressed data | + +-------------+-------------+----------+----------------------------+ + +Compression mode +-------------------------- + +f2fs supports "fs" and "user" compression modes with "compression_mode" mount option. +With this option, f2fs provides a choice to select the way how to compress the +compression enabled files (refer to "Compression implementation" section for how to +enable compression on a regular inode). + +1) compress_mode=fs +This is the default option. f2fs does automatic compression in the writeback of the +compression enabled files. + +2) compress_mode=user +This disables the automaic compression and gives the user discretion of choosing the +target file and the timing. The user can do manual compression/decompression on the +compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE +ioctls like the below. + +To decompress a file, + +fd = open(filename, O_WRONLY, 0); +ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE); + +To compress a file, + +fd = open(filename, O_WRONLY, 0); +ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE); + +NVMe Zoned Namespace devices +---------------------------- + +- ZNS defines a per-zone capacity which can be equal or less than the + zone-size. Zone-capacity is the number of usable blocks in the zone. + F2FS checks if zone-capacity is less than zone-size, if it is, then any + segment which starts after the zone-capacity is marked as not-free in + the free segment bitmap at initial mount time. These segments are marked + as permanently used so they are not allocated for writes and + consequently are not needed to be garbage collected. In case the + zone-capacity is not aligned to default segment size(2MB), then a segment + can start before the zone-capacity and span across zone-capacity boundary. + Such spanning segments are also considered as usable segments. All blocks + past the zone-capacity are considered unusable in these segments. 
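Building on the ioctl fragments shown in the "Compression mode" section above, a
self-contained userspace sketch for compress_mode=user could look as follows. It
assumes the F2FS_IOC_COMPRESS_FILE / F2FS_IOC_DECOMPRESS_FILE definitions are
visible to the compiler (recent kernels export them via <linux/f2fs.h>; on older
trees they only exist in fs/f2fs/f2fs.h and would have to be copied), and that the
target file already has the compression flag set, e.g. via "chattr +c"::

    /*
     * Hedged example: issue the manual compression/decompression ioctls on a
     * compression-enabled file when the filesystem is mounted with
     * compress_mode=user.  F2FS_IOC_* availability is an assumption, see above.
     */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/f2fs.h>

    static int do_ioctl(const char *path, unsigned long cmd)
    {
            int fd = open(path, O_WRONLY);
            int ret;

            if (fd < 0) {
                    perror("open");
                    return -1;
            }
            ret = ioctl(fd, cmd);
            if (ret)
                    perror("ioctl");
            close(fd);
            return ret;
    }

    int main(int argc, char **argv)
    {
            if (argc != 3) {
                    fprintf(stderr, "usage: %s compress|decompress <file>\n", argv[0]);
                    return 1;
            }
            if (argv[1][0] == 'c')
                    return do_ioctl(argv[2], F2FS_IOC_COMPRESS_FILE) ? 1 : 0;
            return do_ioctl(argv[2], F2FS_IOC_DECOMPRESS_FILE) ? 1 : 0;
    }

A typical flow would be: mount with -o compress_mode=user, enable compression on
the file (chattr +c), write and fsync the data, and only then run the tool above,
since compression is intended for write-once data as noted earlier.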
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 79eed58d9d69..1d2d3f879f6c 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -127,6 +127,16 @@ config F2FS_FS_LZ4 help Support LZ4 compress algorithm, if unsure, say Y. +config F2FS_FS_LZ4HC + bool "LZ4HC compression support" + depends on F2FS_FS_COMPRESSION + depends on F2FS_FS_LZ4 + select LZ4HC_COMPRESS + default y + help + Support LZ4HC compress algorithm, LZ4HC has compatible on-disk + layout with LZ4, if unsure, say Y. + config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index a4c6e9af4322..5cc43c7d2e9b 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -253,8 +253,14 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = { #ifdef CONFIG_F2FS_FS_LZ4 static int lz4_init_compress_ctx(struct compress_ctx *cc) { - cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), - LZ4_MEM_COMPRESS, GFP_NOFS); + unsigned int size = LZ4_MEM_COMPRESS; + +#ifdef CONFIG_F2FS_FS_LZ4HC + if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + size = LZ4HC_MEM_COMPRESS; +#endif + + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); if (!cc->private) return -ENOMEM; @@ -273,10 +279,34 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) cc->private = NULL; } +#ifdef CONFIG_F2FS_FS_LZ4HC +static int lz4hc_compress_pages(struct compress_ctx *cc) +{ + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; + int len; + + if (level) + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); + else + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} +#endif + static int lz4_compress_pages(struct compress_ctx *cc) { int len; +#ifdef CONFIG_F2FS_FS_LZ4HC + return lz4hc_compress_pages(cc); +#endif len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); if (!len) @@ -326,8 +356,13 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) ZSTD_CStream *stream; void *workspace; unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; - params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0); + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + + params = ZSTD_getParams(level, cc->rlen, 0); workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 322b51809a39..18b16deac2d1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -145,6 +145,7 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ unsigned char compress_log_size; /* cluster log size */ + unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ int compress_mode; /* compression mode */ @@ -734,6 +735,7 @@ struct f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -1314,6 +1316,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define 
COMPRESS_LEVEL_OFFSET 8 + /* compress context */ struct compress_ctx { struct inode *inode; /* inode the context belong to */ @@ -3945,6 +3949,11 @@ static inline void set_compress_context(struct inode *inode) 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_flag |= + F2FS_OPTION(sbi).compress_level << + COMPRESS_LEVEL_OFFSET; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e0c4d303ec0f..79da50891961 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" @@ -461,6 +463,74 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return 0; } +#ifdef CONFIG_F2FS_FS_COMPRESSION +#ifdef CONFIG_F2FS_FS_LZ4 +static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; +#endif + + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + +#ifdef CONFIG_F2FS_FS_LZ4HC + str += 3; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. :"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +#else + f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + return -EINVAL; +#endif +} +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +{ + unsigned int level; + int len = 4; + + if (strlen(str) == len) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + + str += len; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. 
:"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (!level || level > ZSTD_maxCLevel()) { + f2fs_info(sbi, "invalid zstd compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +} +#endif +#endif + static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -873,20 +943,31 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -ENOMEM; if (!strcmp(name, "lzo")) { #ifdef CONFIG_F2FS_FS_LZO + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; #else f2fs_info(sbi, "kernel doesn't support lzo compression"); #endif - } else if (!strcmp(name, "lz4")) { + } else if (!strncmp(name, "lz4", 3)) { #ifdef CONFIG_F2FS_FS_LZ4 + ret = f2fs_set_lz4hc_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; #else f2fs_info(sbi, "kernel doesn't support lz4 compression"); #endif - } else if (!strcmp(name, "zstd")) { + } else if (!strncmp(name, "zstd", 4)) { #ifdef CONFIG_F2FS_FS_ZSTD + ret = f2fs_set_zstd_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; #else @@ -1543,6 +1624,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, } seq_printf(seq, ",compress_algorithm=%s", algtype); + if (F2FS_OPTION(sbi).compress_level) + seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); + seq_printf(seq, ",compress_log_size=%u", F2FS_OPTION(sbi).compress_log_size); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 7dc2a06cf19a..c6cc0a566ef5 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -274,6 +274,9 @@ struct f2fs_inode { __u8 i_compress_algorithm; /* compress algorithm */ __u8 i_log_cluster_size; /* log of cluster size */ __le16 i_compress_flag; /* compress flag */ + /* 0 bit: chksum flag + * [10,15] bits: compress level + */ __le32 i_extra_end[0]; /* for attribute size calculation */ } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ From 124a3e77e02c064c7c8bfa4b86d736c752188984 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 9 Dec 2020 16:43:27 +0800 Subject: [PATCH 151/580] f2fs: introduce a new per-sb directory in sysfs Add a new directory 'stat' in path of /sys/fs/f2fs//, later we can add new readonly stat sysfs file into this directory, it will make directory less mess. 
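As a rough illustration only: once later patches populate this directory, the
read-only attributes under /sys/fs/f2fs/<disk>/stat/ could be consumed from
userspace with plain file I/O. The directory is still empty at this point in the
series, and the device name "sda" below is just a placeholder::

    /*
     * Hypothetical reader for /sys/fs/f2fs/<disk>/stat/.  Attribute names are
     * not defined by this patch, so the program simply dumps whatever entries
     * exist at runtime.
     */
    #include <stdio.h>
    #include <dirent.h>

    int main(void)
    {
            const char *dir = "/sys/fs/f2fs/sda/stat";    /* placeholder device */
            char path[512], buf[256];
            struct dirent *de;
            DIR *d = opendir(dir);

            if (!d) {
                    perror("opendir");
                    return 1;
            }
            while ((de = readdir(d)) != NULL) {
                    FILE *f;

                    if (de->d_name[0] == '.')
                            continue;
                    snprintf(path, sizeof(path), "%s/%s", dir, de->d_name);
                    f = fopen(path, "r");
                    if (!f)
                            continue;
                    if (fgets(buf, sizeof(buf), f))
                            printf("%s: %s", de->d_name, buf);
                    fclose(f);
            }
            closedir(d);
            return 0;
    }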
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++- fs/f2fs/sysfs.c | 68 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 18b16deac2d1..e97d828e607b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1549,9 +1549,12 @@ struct f2fs_sb_info { unsigned int node_io_flag; /* For sysfs suppport */ - struct kobject s_kobj; + struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; + struct kobject s_stat_kobj; /* /sys/fs/f2fs//stat */ + struct completion s_stat_kobj_unregister; + /* For shrinker support */ struct list_head s_list; int s_ndevs; /* number of devices */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index db9c49f9f0de..e670634a8978 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -709,6 +709,10 @@ static struct attribute *f2fs_feat_attrs[] = { NULL, }; +static struct attribute *f2fs_stat_attrs[] = { + NULL, +}; + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -737,6 +741,44 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static ssize_t f2fs_stat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_stat_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + complete(&sbi->s_stat_kobj_unregister); +} + +static const struct sysfs_ops f2fs_stat_attr_ops = { + .show = f2fs_stat_attr_show, + .store = f2fs_stat_attr_store, +}; + +static struct kobj_type f2fs_stat_ktype = { + .default_attrs = f2fs_stat_attrs, + .sysfs_ops = &f2fs_stat_attr_ops, + .release = f2fs_stat_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -943,11 +985,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) { - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - return err; - } + if (err) + goto put_sb_kobj; + + sbi->s_stat_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_stat_kobj_unregister); + err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, + &sbi->s_kobj, "stat"); + if (err) + goto put_stat_kobj; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -963,6 +1009,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); } return 0; +put_stat_kobj: + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) @@ -974,6 +1027,11 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + + 
kobject_del(&sbi->s_stat_kobj); + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); From 03e6a4a94f47a370c065c59a72ec89bee4240730 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 14 Dec 2020 17:20:57 +0800 Subject: [PATCH 152/580] f2fs: fix to tag FIEMAP_EXTENT_MERGED in f2fs_fiemap() f2fs does not natively support extents in metadata, 'extent' in f2fs is used as a virtual concept, so in f2fs_fiemap() interface, it needs to tag FIEMAP_EXTENT_MERGED flag to indicated the extent status is a result of merging. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9b1b76de66b3..07774b574f54 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1915,6 +1915,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (size) { + flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; From 569a20e378675f18f54619fd7f6e8466d9df3033 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Dec 2020 17:15:23 +0800 Subject: [PATCH 153/580] f2fs: fix out-of-repair __setattr_copy() __setattr_copy() was copied from setattr_copy() in fs/attr.c, there is two missing patches doesn't cover this inner function, fix it. Commit 7fa294c8991c ("userns: Allow chown and setgid preservation") Commit 23adbe12ef7d ("fs,userns: Change inode_capable to capable_wrt_inode_uidgid") Fixes: fbfa2cc58d53 ("f2fs: add file operations") Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 09507d7c4c00..13c00f200029 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -873,7 +873,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } From 4e99026b3dc5ffdf1a7f15c2ec209300c00dcd88 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Dec 2020 16:38:35 +0800 Subject: [PATCH 154/580] f2fs: trival cleanup in move_data_block() Trival cleanups: - relocate set_summary() before its use - relocate "allocate block address" to correct place - remove unneeded f2fs_wait_on_page_writeback() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3ef84e6ded41..39330ad3c44e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1169,8 +1169,6 @@ static int move_data_block(struct inode *inode, block_t bidx, if (err) goto put_out; - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); - /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; @@ -1207,6 +1205,9 @@ static int move_data_block(struct inode *inode, block_t bidx, } } + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* allocate block address */ f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, type, NULL); @@ -1233,9 +1234,6 @@ static int move_data_block(struct inode *inode, block_t bidx, set_page_writeback(fio.encrypted_page); ClearPageError(page); - /* allocate block address */ - 
f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; From 314ed3b1794b79002c30feb512da00ff6a69f286 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 4 Jan 2021 22:33:02 -0800 Subject: [PATCH 155/580] f2fs: clean up post-read processing Rework the post-read processing logic to be much easier to understand. At least one bug is fixed by this: if an I/O error occurred when reading from disk, decryption and verity would be performed on the uninitialized data, causing misleading messages in the kernel log. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 149 ++++++++++++----- fs/f2fs/data.c | 387 ++++++++++++++++++--------------------------- fs/f2fs/f2fs.h | 55 ++++++- 3 files changed, 312 insertions(+), 279 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 5cc43c7d2e9b..c6b1bfacb6d8 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -727,38 +727,27 @@ static int f2fs_compress_pages(struct compress_ctx *cc) return ret; } -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); - struct f2fs_inode_info *fi= F2FS_I(dic->inode); + struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; int ret; int i; - dec_page_count(sbi, F2FS_RD_DATA); - - if (bio->bi_status || PageError(page)) - dic->failed = true; - - if (atomic_dec_return(&dic->pending_pages)) - return; - trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); - /* submit partial compressed pages */ if (dic->failed) { ret = -EIO; - goto out_free_dic; + goto out_end_io; } dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } for (i = 0; i < dic->cluster_size; i++) { @@ -770,20 +759,20 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) dic->tpages[i] = f2fs_compress_alloc_page(); if (!dic->tpages[i]) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } } if (cops->init_decompress_ctx) { ret = cops->init_decompress_ctx(dic); if (ret) - goto out_free_dic; + goto out_end_io; } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) { ret = -ENOMEM; - goto destroy_decompress_ctx; + goto out_destroy_decompress_ctx; } dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); @@ -822,18 +811,34 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: vm_unmap_ram(dic->rbuf, dic->cluster_size); -destroy_decompress_ctx: +out_destroy_decompress_ctx: if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); -out_free_dic: - if (!verity) - f2fs_decompress_end_io(dic->rpages, dic->cluster_size, - ret, false); - +out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - if (!verity) - f2fs_free_dic(dic); + f2fs_decompress_end_io(dic, ret); +} + +/* + * This is called when a page of a compressed cluster has been read from disk + * (or failed to be read from disk). 
It checks whether this page was the last + * page being waited on in the cluster, and if so, it decompresses the cluster + * (or in the case of a failure, cleans up without actually decompressing). + */ +void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + + dec_page_count(sbi, F2FS_RD_DATA); + + if (failed) + WRITE_ONCE(dic->failed, true); + + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1500,6 +1505,8 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, return err; } +static void f2fs_free_dic(struct decompress_io_ctx *dic); + struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; @@ -1518,12 +1525,14 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - atomic_set(&dic->pending_pages, cc->nr_cpages); + atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; dic->nr_cpages = cc->nr_cpages; + refcount_set(&dic->refcnt, 1); dic->failed = false; + dic->need_verity = f2fs_need_verity(cc->inode, start_idx); for (i = 0; i < dic->cluster_size; i++) dic->rpages[i] = cc->rpages[i]; @@ -1552,7 +1561,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) return ERR_PTR(-ENOMEM); } -void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_free_dic(struct decompress_io_ctx *dic) { int i; @@ -1580,30 +1589,88 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) kmem_cache_free(dic_entry_slab, dic); } -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity) +static void f2fs_put_dic(struct decompress_io_ctx *dic) +{ + if (refcount_dec_and_test(&dic->refcnt)) + f2fs_free_dic(dic); +} + +/* + * Update and unlock the cluster's pagecache pages, and release the reference to + * the decompress_io_ctx that was being held for I/O completion. + */ +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) { int i; - for (i = 0; i < cluster_size; i++) { - struct page *rpage = rpages[i]; + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; if (!rpage) continue; - if (err || PageError(rpage)) - goto clear_uptodate; - - if (!verity || fsverity_verify_page(rpage)) { + /* PG_error was set if verity failed. */ + if (failed || PageError(rpage)) { + ClearPageUptodate(rpage); + /* will re-read again later */ + ClearPageError(rpage); + } else { SetPageUptodate(rpage); - goto unlock; } -clear_uptodate: - ClearPageUptodate(rpage); - ClearPageError(rpage); -unlock: unlock_page(rpage); } + + f2fs_put_dic(dic); +} + +static void f2fs_verify_cluster(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); + int i; + + /* Verify the cluster's decompressed pages with fs-verity. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (rpage && !fsverity_verify_page(rpage)) + SetPageError(rpage); + } + + __f2fs_decompress_end_io(dic, false); +} + +/* + * This is called when a compressed cluster has been decompressed + * (or failed to be read and/or decompressed). 
+ */ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +{ + if (!failed && dic->need_verity) { + /* + * Note that to avoid deadlocks, the verity work can't be done + * on the decompression workqueue. This is because verifying + * the data pages can involve reading metadata pages from the + * file, and these metadata pages may be compressed. + */ + INIT_WORK(&dic->verity_work, f2fs_verify_cluster); + fsverity_enqueue_verify_work(&dic->verity_work); + } else { + __f2fs_decompress_end_io(dic, failed); + } +} + +/* + * Put a reference to a compressed page's decompress_io_ctx. + * + * This is called when the page is no longer needed and can be freed. + */ +void f2fs_put_page_dic(struct page *page) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_put_dic(dic); } int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 07774b574f54..1cd764da3247 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -113,10 +113,21 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_DECRYPT, - STEP_DECOMPRESS_NOWQ, /* handle normal cluster data inplace */ - STEP_DECOMPRESS, /* handle compressed cluster data in workqueue */ - STEP_VERITY, +#ifdef CONFIG_FS_ENCRYPTION + STEP_DECRYPT = 1 << 0, +#else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + STEP_DECOMPRESS = 1 << 1, +#else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ +#endif +#ifdef CONFIG_FS_VERITY + STEP_VERITY = 1 << 2, +#else + STEP_VERITY = 0, /* compile out the verity-related code */ +#endif }; struct bio_post_read_ctx { @@ -126,25 +137,26 @@ struct bio_post_read_ctx { unsigned int enabled_steps; }; -static void __read_end_io(struct bio *bio, bool compr, bool verity) +static void f2fs_finish_read_bio(struct bio *bio) { - struct page *page; struct bio_vec *bv; - int i; + int iter_all; - bio_for_each_segment_all(bv, bio, i) { - page = bv->bv_page; + /* + * Update and unlock the bio's pagecache pages, and put the + * decompression context for any compressed pages. + */ + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (compr && f2fs_is_compressed_page(page)) { - f2fs_decompress_pages(bio, page, verity); + if (f2fs_is_compressed_page(page)) { + if (bio->bi_status) + f2fs_end_read_compressed_page(page, true); + f2fs_put_page_dic(page); continue; } - if (verity) - continue; -#endif - /* PG_error was set if any post_read step failed */ + /* PG_error was set if decryption or verity failed. 
*/ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -155,106 +167,104 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } + + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); } -static void f2fs_release_read_bio(struct bio *bio); -static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) -{ - if (!compr) - __read_end_io(bio, false, verity); - f2fs_release_read_bio(bio); -} - -static void f2fs_decompress_bio(struct bio *bio, bool verity) -{ - __read_end_io(bio, true, verity); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx); - -static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) -{ - fscrypt_decrypt_bio(ctx->bio); -} - -static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) -{ - f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); -} - -#ifdef CONFIG_F2FS_FS_COMPRESSION -static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) -{ - f2fs_decompress_end_io(rpages, cluster_size, false, true); -} - -static void f2fs_verify_bio(struct bio *bio) -{ - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - struct decompress_io_ctx *dic; - - dic = (struct decompress_io_ctx *)page_private(page); - - if (dic) { - if (atomic_dec_return(&dic->verity_pages)) - continue; - f2fs_verify_pages(dic->rpages, - dic->cluster_size); - f2fs_free_dic(dic); - continue; - } - - if (bio->bi_status || PageError(page)) - goto clear_uptodate; - - if (fsverity_verify_page(page)) { - SetPageUptodate(page); - goto unlock; - } -clear_uptodate: - ClearPageUptodate(page); - ClearPageError(page); -unlock: - dec_page_count(F2FS_P_SB(page), __read_io_type(page)); - unlock_page(page); - } -} -#endif - -static void f2fs_verity_work(struct work_struct *work) +static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; -#ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int enabled_steps = ctx->enabled_steps; -#endif + bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* * fsverity_verify_bio() may call readpages() again, and while verity - * will be disabled for this, decryption may still be needed, resulting - * in another bio_post_read_ctx being allocated. So to prevent - * deadlocks we need to release the current ctx to the mempool first. - * This assumes that verity is the last post-read step. + * will be disabled for this, decryption and/or decompression may still + * be needed, resulting in another bio_post_read_ctx being allocated. + * So to prevent deadlocks we need to release the current ctx to the + * mempool first. This assumes that verity is the last post-read step. */ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; -#ifdef CONFIG_F2FS_FS_COMPRESSION - /* previous step is decompression */ - if (enabled_steps & (1 << STEP_DECOMPRESS)) { - f2fs_verify_bio(bio); - f2fs_release_read_bio(bio); - return; - } -#endif + /* + * Verify the bio's pages with fs-verity. Exclude compressed pages, + * as those were handled separately by f2fs_end_read_compressed_page(). 
+ */ + if (may_have_compressed_pages) { + struct bio_vec *bv; + int iter_all; - fsverity_verify_bio(bio); - __f2fs_read_end_io(bio, false, false); + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!f2fs_is_compressed_page(page) && + !PageError(page) && !fsverity_verify_page(page)) + SetPageError(page); + } + } else { + fsverity_verify_bio(bio); + } + + f2fs_finish_read_bio(bio); +} + +/* + * If the bio's data needs to be verified with fs-verity, then enqueue the + * verity work for the bio. Otherwise finish the bio now. + * + * Note that to avoid deadlocks, the verity work can't be done on the + * decryption/decompression workqueue. This is because verifying the data pages + * can involve reading verity metadata pages from the file, and these verity + * metadata pages may be encrypted and/or compressed. + */ +static void f2fs_verify_and_finish_bio(struct bio *bio) +{ + struct bio_post_read_ctx *ctx = bio->bi_private; + + if (ctx && (ctx->enabled_steps & STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verify_bio); + fsverity_enqueue_verify_work(&ctx->work); + } else { + f2fs_finish_read_bio(bio); + } +} + +/* + * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last + * remaining page was read by @ctx->bio. + * + * Note that a bio may span clusters (even a mix of compressed and uncompressed + * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates + * that the bio includes at least one compressed page. The actual decompression + * is done on a per-cluster basis, not a per-bio basis. + */ +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) +{ + struct bio_vec *bv; + int iter_all; + bool all_compressed = true; + + bio_for_each_segment_all(bv, ctx->bio, iter_all) { + struct page *page = bv->bv_page; + + /* PG_error was set if decryption failed. */ + if (f2fs_is_compressed_page(page)) + f2fs_end_read_compressed_page(page, PageError(page)); + else + all_compressed = false; + } + + /* + * Optimization: if all the bio's pages are compressed, then scheduling + * the per-bio verity work is unnecessary, as verity will be fully + * handled at the compression cluster level. + */ + if (all_compressed) + ctx->enabled_steps &= ~STEP_VERITY; } static void f2fs_post_read_work(struct work_struct *work) @@ -262,88 +272,50 @@ static void f2fs_post_read_work(struct work_struct *work) struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) - f2fs_decrypt_work(ctx); + if (ctx->enabled_steps & STEP_DECRYPT) + fscrypt_decrypt_bio(ctx->bio); - if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) - f2fs_decompress_work(ctx); + if (ctx->enabled_steps & STEP_DECOMPRESS) + f2fs_handle_step_decompress(ctx); - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, - ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); -} - -static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, - struct work_struct *work) -{ - queue_work(sbi->post_read_wq, work); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx) -{ - /* - * We use different work queues for decryption and for verity because - * verity may require reading metadata pages that need decryption, and - * we shouldn't recurse to the same workqueue. 
- */ - - if (ctx->enabled_steps & (1 << STEP_DECRYPT) || - ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); - return; - } - - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, false, false); -} - -static bool f2fs_bio_post_read_required(struct bio *bio) -{ - return bio->bi_private; + f2fs_verify_and_finish_bio(ctx->bio); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct bio_post_read_ctx *ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } - if (f2fs_bio_post_read_required(bio)) { - struct bio_post_read_ctx *ctx = bio->bi_private; - - bio_post_read_processing(ctx); + if (bio->bi_status) { + f2fs_finish_read_bio(bio); return; } - __f2fs_read_end_io(bio, false, false); + if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + } else { + f2fs_verify_and_finish_bio(bio); + } } static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; - int i; + int iter_all; if (time_to_inject(sbi, FAULT_WRITE_IO)) { f2fs_show_injection_info(sbi, FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; } - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); @@ -565,7 +537,7 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, struct page *page, nid_t ino) { struct bio_vec *bvec; - int i; + int iter_all; if (!bio) return false; @@ -573,7 +545,7 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, if (!inode && !page && !ino) return true; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *target = bvec->bv_page; if (fscrypt_is_bounce_page(target)) { @@ -976,16 +948,9 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) up_write(&io->io_rwsem); } -static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) -{ - return fsverity_active(inode) && - idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); -} - static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx, bool for_write, - bool for_verity) + pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1001,13 +966,19 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (f2fs_encrypted_file(inode)) - post_read_steps |= 1 << STEP_DECRYPT; - if (f2fs_compressed_file(inode)) - post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; - if (for_verity && f2fs_need_verity(inode, first_idx)) - post_read_steps |= 1 << STEP_VERITY; + post_read_steps |= STEP_DECRYPT; - if (post_read_steps) { + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= STEP_VERITY; + + /* + * STEP_DECOMPRESS is handled specially, since a compressed file might + * contain both compressed and uncompressed clusters. 
We'll allocate a + * bio_post_read_ctx if the file is compressed, but the caller is + * responsible for enabling STEP_DECOMPRESS if it's actually needed. + */ + + if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; @@ -1019,13 +990,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return bio; } -static void f2fs_release_read_bio(struct bio *bio) -{ - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); -} - /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr, int op_flags, bool for_write) @@ -1034,7 +998,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, - page->index, for_write, true); + page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -2072,7 +2036,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, - false, true); + false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2118,8 +2082,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct bio_post_read_ctx *ctx; - bool for_verity = false; int i; int ret = 0; @@ -2185,29 +2147,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - /* - * It's possible to enable fsverity on the fly when handling a cluster, - * which requires complicated error handling. Instead of adding more - * complexity, let's give a rule where end_io post-processes fsverity - * per cluster. In order to do that, we need to submit bio, if previous - * bio sets a different post-process policy. - */ - if (fsverity_active(cc->inode)) { - atomic_set(&dic->verity_pages, cc->nr_cpages); - for_verity = true; - - if (bio) { - ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { - __submit_bio(sbi, bio, DATA); - bio = NULL; - } - } - } - for (i = 0; i < dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; + struct bio_post_read_ctx *ctx; blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2222,31 +2165,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? 
REQ_RAHEAD : 0, - page->index, for_write, for_verity); + page->index, for_write); if (IS_ERR(bio)) { - unsigned int remained = dic->nr_cpages - i; - bool release = false; - ret = PTR_ERR(bio); - dic->failed = true; - - if (for_verity) { - if (!atomic_sub_return(remained, - &dic->verity_pages)) - release = true; - } else { - if (!atomic_sub_return(remained, - &dic->pending_pages)) - release = true; - } - - if (release) { - f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); - f2fs_free_dic(dic); - } - + f2fs_decompress_end_io(dic, ret); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; @@ -2258,10 +2180,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; - /* tag STEP_DECOMPRESS to handle IO in wq */ ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS))) - ctx->enabled_steps |= 1 << STEP_DECOMPRESS; + ctx->enabled_steps |= STEP_DECOMPRESS; + refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); @@ -2278,7 +2199,13 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, out_put_dnode: f2fs_put_dnode(&dn); out: - f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) { + ClearPageUptodate(cc->rpages[i]); + ClearPageError(cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + } *bio_ret = bio; return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e97d828e607b..537b9365db0e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1345,7 +1345,7 @@ struct compress_io_ctx { atomic_t pending_pages; /* in-flight compressed page count */ }; -/* decompress io context for read IO path */ +/* Context for decompressing one cluster on the read IO path */ struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ @@ -1361,11 +1361,37 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - atomic_t pending_pages; /* in-flight compressed page count */ - atomic_t verity_pages; /* in-flight page count for verity */ - bool failed; /* indicate IO error during decompression */ + + /* + * The number of compressed pages remaining to be read in this cluster. + * This is initially nr_cpages. It is decremented by 1 each time a page + * has been read (or failed to be read). When it reaches 0, the cluster + * is decompressed (or an error is reported). + * + * If an error occurs before all the pages have been submitted for I/O, + * then this will never reach 0. In this case the I/O submitter is + * responsible for calling f2fs_decompress_end_io() instead. + */ + atomic_t remaining_pages; + + /* + * Number of references to this decompress_io_ctx. + * + * One reference is held for I/O completion. This reference is dropped + * after the pagecache pages are updated and unlocked -- either after + * decompression (and verity if enabled), or after an error. + * + * In addition, each compressed page holds a reference while it is in a + * bio. These references are necessary prevent compressed pages from + * being freed while they are still in a bio. + */ + refcount_t refcnt; + + bool failed; /* IO error occurred before decompression? */ + bool need_verity; /* need fs-verity verification after decompression? 
*/ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ + struct work_struct verity_work; /* work to verify the decompressed pages */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -3894,7 +3920,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +void f2fs_end_read_compressed_page(struct page *page, bool failed); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); @@ -3907,9 +3933,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_free_dic(struct decompress_io_ctx *dic); -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); +void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -3933,6 +3958,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } +static inline void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + WARN_ON_ONCE(1); +} +static inline void f2fs_put_page_dic(struct page *page) +{ + WARN_ON_ONCE(1); +} static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } @@ -4153,6 +4186,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return false; } +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + #ifdef CONFIG_F2FS_FAULT_INJECTION extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, unsigned int type); From 3708f23a900521d1c57202b36f47ab0dc9efe21a Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 6 Jan 2021 08:49:28 +0900 Subject: [PATCH 156/580] f2fs: fix null page reference in redirty_blocks By Colin's static analysis, we found out there is a null page reference under low memory situation in redirty_blocks. I've made the page finding loop stop immediately and return an error not to cause further memory pressure when we run into a failure to find a page under low memory condition. 
Signed-off-by: Daeho Jeong Reported-by: Colin Ian King Fixes: 5fdb322ff2c2 ("f2fs: add F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE") Reviewed-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 13c00f200029..229ebe13294a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4080,8 +4080,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) for (i = 0; i < page_len; i++, redirty_idx++) { page = find_lock_page(mapping, redirty_idx); - if (!page) - ret = -ENOENT; + if (!page) { + ret = -ENOMEM; + break; + } set_page_dirty(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); From 7383fd9bc8810390f9d4ea1909b825fbb224097f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Jan 2021 09:55:09 +0800 Subject: [PATCH 157/580] f2fs: fix to set/clear I_LINKABLE under i_lock fsstress + fault injection test case reports a warning message as below: WARNING: CPU: 13 PID: 6226 at fs/inode.c:361 inc_nlink+0x32/0x40 Call Trace: f2fs_init_inode_metadata+0x25c/0x4a0 [f2fs] f2fs_add_inline_entry+0x153/0x3b0 [f2fs] f2fs_add_dentry+0x75/0x80 [f2fs] f2fs_do_add_link+0x108/0x160 [f2fs] f2fs_rename2+0x6ab/0x14f0 [f2fs] vfs_rename+0x70c/0x940 do_renameat2+0x4d8/0x4f0 __x64_sys_renameat2+0x4b/0x60 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Following race case can cause this: Thread A Kworker - f2fs_rename - f2fs_create_whiteout - __f2fs_tmpfile - f2fs_i_links_write - f2fs_mark_inode_dirty_sync - mark_inode_dirty_sync - writeback_single_inode - __writeback_single_inode - spin_lock(&inode->i_lock) - inode->i_state |= I_LINKABLE - inode->i_state &= ~dirty - spin_unlock(&inode->i_lock) - f2fs_add_link - f2fs_do_add_link - f2fs_add_dentry - f2fs_add_inline_entry - f2fs_init_inode_metadata - f2fs_i_links_write - inc_nlink - WARN_ON(!(inode->i_state & I_LINKABLE)) Fix to add i_lock to avoid i_state update race condition. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 70a8e516fd32..db5d6c666ea2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -850,7 +850,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (whiteout) { f2fs_i_links_write(inode, false); + + spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + *whiteout = inode; } else { d_tmpfile(dentry, inode); @@ -1036,7 +1040,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; + + spin_lock(&whiteout->i_lock); whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); + iput(whiteout); } From d0b5341c90c294524434442c597504be2bec3674 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Dec 2020 15:25:29 -0800 Subject: [PATCH 158/580] libfs: unexport generic_ci_d_compare() and generic_ci_d_hash() Now that generic_set_encrypted_ci_d_ops() has been added and ext4 and f2fs are using it, it's no longer necessary to export generic_ci_d_compare() and generic_ci_d_hash() to filesystems. 
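As a rough illustration of the intended usage (not code from this patch), a filesystem's ->lookup now only needs the remaining exported helper; myfs_lookup() and myfs_iget() below are hypothetical names:

	static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
					  unsigned int flags)
	{
		struct inode *inode;

		/*
		 * Installs the casefolding/fscrypt dentry_operations internally,
		 * so the filesystem never references generic_ci_d_hash() or
		 * generic_ci_d_compare() directly anymore.
		 */
		generic_set_encrypted_ci_d_ops(dentry);

		inode = myfs_iget(dir, dentry);		/* hypothetical helper */
		return d_splice_alias(inode, dentry);
	}
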
Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/libfs.c | 8 +++----- include/linux/fs.h | 5 ----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index b82041854991..5743ce49a683 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1277,8 +1277,8 @@ static bool needs_casefold(const struct inode *dir) * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ -int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) +static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) { const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *dir = READ_ONCE(parent->d_inode); @@ -1315,7 +1315,6 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, return 1; return !!memcmp(str, name->name, len); } -EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems @@ -1324,7 +1323,7 @@ EXPORT_SYMBOL(generic_ci_d_compare); * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ -int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; @@ -1339,7 +1338,6 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return -EINVAL; return 0; } -EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, diff --git a/include/linux/fs.h b/include/linux/fs.h index 24e5bc507eca..a54ccfb9ea4a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3251,11 +3251,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -#ifdef CONFIG_UNICODE -extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); -extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name); -#endif extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); #ifdef CONFIG_MIGRATION From 256b72b2944b247e2c5ab6fa82120d83f08987d0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 11 Jan 2021 17:42:53 +0800 Subject: [PATCH 159/580] f2fs: compress: fix potential deadlock generic/269 reports a hangtask issue, the root cause is ABBA deadlock described as below: Thread A Thread B - down_write(&sbi->gc_lock) -- A - f2fs_write_data_pages - lock all pages in cluster -- B - f2fs_write_multi_pages - f2fs_write_raw_pages - f2fs_write_single_data_page - f2fs_balance_fs - down_write(&sbi->gc_lock) -- A - f2fs_gc - do_garbage_collect - ra_data_block - pagecache_get_page -- B To fix this, it needs to avoid calling f2fs_balance_fs() if there is still cluster pages been locked in context of cluster writeback, so instead, let's call f2fs_balance_fs() in the end of f2fs_write_raw_pages() when all cluster pages were unlocked. 
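For reference, the same ordering rule in a minimal userspace sketch (plain pthreads, not f2fs code): the writeback-like thread releases its page-level lock ("B") before it tries to take the lock the GC-like thread holds ("A"), so neither thread ever waits for one lock while holding the other.

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t gc_lock = PTHREAD_MUTEX_INITIALIZER;	/* "A" */
	static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;	/* "B" */

	static void *writeback_path(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&page_lock);		/* lock cluster pages (B) */
		/* ... write the cluster out ... */
		pthread_mutex_unlock(&page_lock);	/* drop B first ... */

		pthread_mutex_lock(&gc_lock);		/* ... then balance/GC (A) */
		pthread_mutex_unlock(&gc_lock);
		return NULL;
	}

	static void *gc_path(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&gc_lock);		/* A */
		pthread_mutex_lock(&page_lock);		/* B: safe, nobody waits on A while holding B */
		pthread_mutex_unlock(&page_lock);
		pthread_mutex_unlock(&gc_lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t t1, t2;

		pthread_create(&t1, NULL, writeback_path, NULL);
		pthread_create(&t2, NULL, gc_path, NULL);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);
		puts("no ABBA deadlock: B is always released before A is taken");
		return 0;
	}
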
Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 5 ++++- fs/f2fs/data.c | 10 ++++++---- fs/f2fs/f2fs.h | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c6b1bfacb6d8..f2918a62989c 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1426,7 +1426,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, - compr_blocks); + compr_blocks, false); if (ret) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(cc->rpages[i]); @@ -1461,6 +1461,9 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, *submitted += _submitted; } + + f2fs_balance_fs(F2FS_M_SB(mapping), true); + return 0; out_err: for (++i; i < cc->cluster_size; i++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1cd764da3247..36efd7771116 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2623,7 +2623,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks) + int compr_blocks, + bool allow_balance) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2761,7 +2762,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task) + !F2FS_I(inode)->cp_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -2808,7 +2809,7 @@ static int f2fs_write_data_page(struct page *page, #endif return f2fs_write_single_data_page(page, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0); + wbc, FS_DATA_IO, 0, true); } /* @@ -2978,7 +2979,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } #endif ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, 0); + &bio, &last_block, wbc, io_type, + 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) unlock_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 537b9365db0e..a4678bb18d8c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3513,7 +3513,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks); + int compr_blocks, bool allow_balance); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); From 79ac6fa9f50da225d0d96b03c9f7a1bd5989cc44 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 13 Jan 2021 13:21:54 +0800 Subject: [PATCH 160/580] f2fs: fix to use per-inode maxbytes F2FS inode may have different max size, e.g. compressed file have less blkaddr entries in all its direct-node blocks, result in being with less max filesize. So change to use per-inode maxbytes. 
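To make the size difference concrete, here is a hedged userspace sketch of the node-block geometry behind max_file_blocks(); the 1018/1016 leaf counts and the two-direct/two-indirect/one-double-indirect breakdown follow the usual f2fs layout and are stated as assumptions for illustration, not values taken verbatim from this diff:

	#include <stdio.h>

	#define NIDS_PER_BLOCK	1018ULL		/* assumed node-id slots per indirect block */

	/*
	 * Assumed shape of the calculation: direct pointers inside the inode are
	 * ignored (counted as zero), then two direct node blocks, two indirect
	 * node blocks and one double-indirect node block are added up.
	 */
	static unsigned long long max_blocks(unsigned long long leaf_count)
	{
		unsigned long long result = 0;

		result += leaf_count * 2;					/* 2 direct node blocks */
		result += leaf_count * NIDS_PER_BLOCK * 2;			/* 2 indirect node blocks */
		result += leaf_count * NIDS_PER_BLOCK * NIDS_PER_BLOCK;	/* 1 double-indirect block */
		return result;
	}

	int main(void)
	{
		/* a regular file uses the full 1018 addresses per direct-node block,
		 * a compressed file a smaller, cluster-aligned count (1016 is illustrative) */
		printf("regular    : %llu blocks\n", max_blocks(1018));
		printf("compressed : %llu blocks\n", max_blocks(1016));
		return 0;
	}
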
Suggested-by: Chao Yu Signed-off-by: Chengguang Xu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 9 ++++++--- fs/f2fs/super.c | 12 ++++++++---- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 36efd7771116..0095a1d52722 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3715,7 +3715,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks)) + if (unlikely(block >= max_file_blocks(inode))) goto out; if (f2fs_compressed_file(inode)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a4678bb18d8c..d9353d976c41 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1478,7 +1478,6 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ int readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ @@ -3275,6 +3274,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); +loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 229ebe13294a..bd6bf9ebf333 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -505,6 +505,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; + if (f2fs_compressed_file(inode)) + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + switch (whence) { case SEEK_SET: case SEEK_CUR: @@ -689,7 +692,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BLK_ALIGN(from); - if (free_from >= sbi->max_file_blocks) + if (free_from >= max_file_blocks(inode)) goto free_partial; if (lock) @@ -2770,7 +2773,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) return -EINVAL; if (unlikely((range.start + range.len) >> PAGE_SHIFT > - sbi->max_file_blocks)) + max_file_blocks(inode))) return -EINVAL; err = mnt_want_write_file(filp); @@ -3333,7 +3336,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - end = F2FS_I_SB(inode)->max_file_blocks; + end = max_file_blocks(inode); while (map.m_lblk < end) { map.m_len = end - map.m_lblk; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 79da50891961..aaa52d34c3c1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2701,10 +2701,10 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_blocks(void) +loff_t max_file_blocks(struct inode *inode) { loff_t result = 0; - loff_t leaf_count = DEF_ADDRS_PER_BLOCK; + loff_t leaf_count; /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - @@ -2713,6 +2713,11 @@ static loff_t max_file_blocks(void) * result as zero. 
*/ + if (inode && f2fs_compressed_file(inode)) + leaf_count = ADDRS_PER_BLOCK(inode); + else + leaf_count = DEF_ADDRS_PER_BLOCK; + /* two direct node blocks */ result += (leaf_count * 2); @@ -3589,8 +3594,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_options; - sbi->max_file_blocks = max_file_blocks(); - sb->s_maxbytes = sbi->max_file_blocks << + sb->s_maxbytes = max_file_blocks(NULL) << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; From 5e5afb124c4527b04e2d49927e1d039ba4f5d5d0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 14 Jan 2021 09:41:27 +0800 Subject: [PATCH 161/580] f2fs: introduce sb_status sysfs node Introduce /sys/fs/f2fs//stat/sb_status to show superblock status in real time as a hexadecimal value. value sb status macro description 0x1 SBI_IS_DIRTY, /* dirty flag for checkpoint */ 0x2 SBI_IS_CLOSE, /* specify unmounting */ 0x4 SBI_NEED_FSCK, /* need fsck.f2fs to fix */ 0x8 SBI_POR_DOING, /* recovery is doing or not */ 0x10 SBI_NEED_SB_WRITE, /* need to recover superblock */ 0x20 SBI_NEED_CP, /* need to checkpoint */ 0x40 SBI_IS_SHUTDOWN, /* shutdown by ioctl */ 0x80 SBI_IS_RECOVERED, /* recovered orphan/data */ 0x100 SBI_CP_DISABLED, /* CP was disabled last mount */ 0x200 SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ 0x400 SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ 0x800 SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ 0x1000 SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ 0x2000 SBI_IS_RESIZEFS, /* resizefs is in process */ Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 23 +++++++++++++++++++++++ fs/f2fs/sysfs.c | 8 ++++++++ 2 files changed, 31 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2618587ab55e..abb7717eb31d 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -357,3 +357,26 @@ Description: This gives a control to limit the bio size in f2fs. Default is zero, which will follow underlying block layer limit, whereas, if it has a certain bytes value, f2fs won't submit a bio larger than that size. + +What: /sys/fs/f2fs//stat/sb_status +Date: December 2020 +Contact: "Chao Yu" +Description: Show status of f2fs superblock in real time. 
+ + ====== ===================== ================================= + value sb status macro description + 0x1 SBI_IS_DIRTY dirty flag for checkpoint + 0x2 SBI_IS_CLOSE specify unmounting + 0x4 SBI_NEED_FSCK need fsck.f2fs to fix + 0x8 SBI_POR_DOING recovery is doing or not + 0x10 SBI_NEED_SB_WRITE need to recover superblock + 0x20 SBI_NEED_CP need to checkpoint + 0x40 SBI_IS_SHUTDOWN shutdown by ioctl + 0x80 SBI_IS_RECOVERED recovered orphan/data + 0x100 SBI_CP_DISABLED CP was disabled last mount + 0x200 SBI_CP_DISABLED_QUICK CP was disabled quickly + 0x400 SBI_QUOTA_NEED_FLUSH need to flush quota info in CP + 0x800 SBI_QUOTA_SKIP_FLUSH skip flushing quota in current CP + 0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted + 0x2000 SBI_IS_RESIZEFS resizefs is in process + ====== ===================== ================================= diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e670634a8978..81840ba130e4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -101,6 +101,12 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, sbi->sectors_written_start) >> 1))); } +static ssize_t sb_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%lx\n", sbi->s_flag); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -709,7 +715,9 @@ static struct attribute *f2fs_feat_attrs[] = { NULL, }; +F2FS_GENERAL_RO_ATTR(sb_status); static struct attribute *f2fs_stat_attrs[] = { + ATTR_LIST(sb_status), NULL, }; From 5ecffc99fb8bc2e491a1eef1ab48c389efa09adb Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Wed, 13 Jan 2021 17:58:53 +0800 Subject: [PATCH 162/580] f2fs: remove unused stat_{inc, dec}_atomic_write Just clean code, no logical change. Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d9353d976c41..d96254494f9d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3759,8 +3759,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) -#define stat_inc_atomic_write(inode) do { } while (0) -#define stat_dec_atomic_write(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_volatile_write(inode) do { } while (0) #define stat_dec_volatile_write(inode) do { } while (0) From ff6bdecfa0ad51922ba6f8ee24b827ed52e5316f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 14 Jan 2021 13:59:09 -0800 Subject: [PATCH 163/580] f2fs: deprecate f2fs_trace_io This patch deprecates f2fs_trace_io, since f2fs uses page->private more broadly, resulting in more buggy cases. Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 10 --- fs/f2fs/Makefile | 1 - fs/f2fs/checkpoint.c | 3 - fs/f2fs/data.c | 4 -- fs/f2fs/file.c | 2 - fs/f2fs/node.c | 2 - fs/f2fs/segment.c | 3 - fs/f2fs/super.c | 6 -- fs/f2fs/trace.c | 165 ------------------------------------------- fs/f2fs/trace.h | 43 ----------- 10 files changed, 239 deletions(-) delete mode 100644 fs/f2fs/trace.c delete mode 100644 fs/f2fs/trace.h diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1d2d3f879f6c..5c0c113cf411 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -84,16 +84,6 @@ config F2FS_FS_ENCRYPTION FS_ENCRYPTION. Use CONFIG_FS_ENCRYPTION=y in new config files. 
-config F2FS_IO_TRACE - bool "F2FS IO tracer" - depends on F2FS_FS - depends on FUNCTION_TRACER - help - F2FS IO trace is based on a function trace, which gathers process - information and block IO patterns in the filesystem level. - - If unsure, say N. - config F2FS_FAULT_INJECTION bool "F2FS fault injection facility" depends on F2FS_FS diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ee7316b42f69..e5295746208b 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,6 +7,5 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o -f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 617d0f6b0836..404420c6801b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -17,7 +17,6 @@ #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include static struct kmem_cache *ino_entry_slab; @@ -443,7 +442,6 @@ static int f2fs_set_meta_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; @@ -1017,7 +1015,6 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) spin_unlock(&sbi->inode_lock[type]); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); } void f2fs_remove_dirty_inode(struct inode *inode) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0095a1d52722..b7bcaa7a4113 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -23,7 +23,6 @@ #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include #define NUM_PREALLOC_POST_READ_CTXS 128 @@ -650,7 +649,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); @@ -845,7 +843,6 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -935,7 +932,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; - f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); skip: diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bd6bf9ebf333..3e37198a880d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -29,7 +29,6 @@ #include "xattr.h" #include "acl.h" #include "gc.h" -#include "trace.h" #include #include @@ -369,7 +368,6 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); - f2fs_trace_ios(NULL, 1); return ret; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5f561eb33d86..0137878cb394 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,7 +17,6 @@ #include "node.h" #include "segment.h" #include "xattr.h" -#include "trace.h" #include #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) @@ -2089,7 +2088,6 @@ static int f2fs_set_node_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; diff --git 
a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6ec659bc8d8d..651514e8f226 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,7 +20,6 @@ #include "segment.h" #include "node.h" #include "gc.h" -#include "trace.h" #include #define __reverse_ffz(x) __reverse_ffs(~(x)) @@ -187,8 +186,6 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_trace_pid(page); - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index aaa52d34c3c1..1b77db10ced1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -32,7 +32,6 @@ #include "segment.h" #include "xattr.h" #include "gc.h" -#include "trace.h" #define CREATE_TRACE_POINTS #include @@ -1439,8 +1438,6 @@ int f2fs_sync_fs(struct super_block *sb, int sync) err = f2fs_write_checkpoint(sbi, &cpc); up_write(&sbi->gc_lock); } - f2fs_trace_ios(NULL, 1); - return err; } @@ -4073,8 +4070,6 @@ static int __init init_f2fs_fs(void) return -EINVAL; } - f2fs_build_trace_ios(); - err = init_inodecache(); if (err) goto fail; @@ -4167,7 +4162,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); destroy_inodecache(); - f2fs_destroy_trace_ios(); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c deleted file mode 100644 index d0ab533a9ce8..000000000000 --- a/fs/f2fs/trace.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#include -#include -#include -#include - -#include "f2fs.h" -#include "trace.h" - -static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; -static struct last_io_info last_io; - -static inline void __print_last_io(void) -{ - if (!last_io.len) - return; - - trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", - last_io.major, last_io.minor, - last_io.pid, "----------------", - last_io.type, - last_io.fio.op, last_io.fio.op_flags, - last_io.fio.new_blkaddr, - last_io.len); - memset(&last_io, 0, sizeof(last_io)); -} - -static int __file_type(struct inode *inode, pid_t pid) -{ - if (f2fs_is_atomic_file(inode)) - return __ATOMIC_FILE; - else if (f2fs_is_volatile_file(inode)) - return __VOLATILE_FILE; - else if (S_ISDIR(inode->i_mode)) - return __DIR_FILE; - else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) - return __NODE_FILE; - else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) - return __META_FILE; - else if (pid) - return __NORMAL_FILE; - else - return __MISC_FILE; -} - -void f2fs_trace_pid(struct page *page) -{ - struct inode *inode = page->mapping->host; - pid_t pid = task_pid_nr(current); - void *p; - - set_page_private(page, (unsigned long)pid); - -retry: - if (radix_tree_preload(GFP_NOFS)) - return; - - spin_lock(&pids_lock); - p = radix_tree_lookup(&pids, pid); - if (p == current) - goto out; - if (p) - radix_tree_delete(&pids, pid); - - if (radix_tree_insert(&pids, pid, current)) { - spin_unlock(&pids_lock); - radix_tree_preload_end(); - cond_resched(); - goto retry; - } - - trace_printk("%3x:%3x %4x %-16s\n", - MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), - pid, current->comm); -out: - spin_unlock(&pids_lock); - radix_tree_preload_end(); -} - -void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) -{ - struct inode *inode; - pid_t pid; - int major, minor; - - if (flush) { - __print_last_io(); - return; - } - - inode = 
fio->page->mapping->host; - pid = page_private(fio->page); - - major = MAJOR(inode->i_sb->s_dev); - minor = MINOR(inode->i_sb->s_dev); - - if (last_io.major == major && last_io.minor == minor && - last_io.pid == pid && - last_io.type == __file_type(inode, pid) && - last_io.fio.op == fio->op && - last_io.fio.op_flags == fio->op_flags && - last_io.fio.new_blkaddr + last_io.len == - fio->new_blkaddr) { - last_io.len++; - return; - } - - __print_last_io(); - - last_io.major = major; - last_io.minor = minor; - last_io.pid = pid; - last_io.type = __file_type(inode, pid); - last_io.fio = *fio; - last_io.len = 1; - return; -} - -void f2fs_build_trace_ios(void) -{ - spin_lock_init(&pids_lock); -} - -#define PIDVEC_SIZE 128 -static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, - unsigned int max_items) -{ - struct radix_tree_iter iter; - void **slot; - unsigned int ret = 0; - - if (unlikely(!max_items)) - return 0; - - radix_tree_for_each_slot(slot, &pids, &iter, first_index) { - results[ret] = iter.index; - if (++ret == max_items) - break; - } - return ret; -} - -void f2fs_destroy_trace_ios(void) -{ - pid_t pid[PIDVEC_SIZE]; - pid_t next_pid = 0; - unsigned int found; - - spin_lock(&pids_lock); - while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { - unsigned idx; - - next_pid = pid[found - 1] + 1; - for (idx = 0; idx < found; idx++) - radix_tree_delete(&pids, pid[idx]); - } - spin_unlock(&pids_lock); -} diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h deleted file mode 100644 index 789f6aa727fc..000000000000 --- a/fs/f2fs/trace.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#ifndef __F2FS_TRACE_H__ -#define __F2FS_TRACE_H__ - -#ifdef CONFIG_F2FS_IO_TRACE -#include - -enum file_type { - __NORMAL_FILE, - __DIR_FILE, - __NODE_FILE, - __META_FILE, - __ATOMIC_FILE, - __VOLATILE_FILE, - __MISC_FILE, -}; - -struct last_io_info { - int major, minor; - pid_t pid; - enum file_type type; - struct f2fs_io_info fio; - block_t len; -}; - -extern void f2fs_trace_pid(struct page *); -extern void f2fs_trace_ios(struct f2fs_io_info *, int); -extern void f2fs_build_trace_ios(void); -extern void f2fs_destroy_trace_ios(void); -#else -#define f2fs_trace_pid(p) -#define f2fs_trace_ios(i, n) -#define f2fs_build_trace_ios() -#define f2fs_destroy_trace_ios() - -#endif -#endif /* __F2FS_TRACE_H__ */ From 046c59c130006ebc26995a3d70ee0e7eb717b564 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 26 Jan 2021 17:00:42 -0800 Subject: [PATCH 164/580] f2fs: flush data when enabling checkpoint back During checkpoint=disable period, f2fs bypasses all the synchronous IOs such as sync and fsync. So, when enabling it back, we must flush all of them in order to keep the data persistent. Otherwise, suddern power-cut right after enabling checkpoint will cause data loss. 
Fixes: 4354994f097d ("f2fs: checkpoint disabling") Cc: stable@vger.kernel.org Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1b77db10ced1..c371458c3413 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1876,6 +1876,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { + /* we should flush all the data to keep data consistency */ + sync_inodes_sb(sbi->sb); + down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); From e7b876906b21f3faea6f651812e0f113d2659071 Mon Sep 17 00:00:00 2001 From: Yi Chen Date: Thu, 28 Jan 2021 17:02:56 +0800 Subject: [PATCH 165/580] f2fs: fix to avoid inconsistent quota data Occasionally, quota data may be corrupted detected by fsck: Info: checkpoint state = 45 : crc compacted_summary unmount [QUOTA WARNING] Usage inconsistent for ID 0:actual (1543036928, 762) != expected (1543032832, 762) [ASSERT] (fsck_chk_quota_files:1986) --> Quota file is missing or invalid quota file content found. [QUOTA WARNING] Usage inconsistent for ID 0:actual (1352478720, 344) != expected (1352474624, 344) [ASSERT] (fsck_chk_quota_files:1986) --> Quota file is missing or invalid quota file content found. [FSCK] Unreachable nat entries [Ok..] [0x0] [FSCK] SIT valid block bitmap checking [Ok..] [FSCK] Hard link checking for regular file [Ok..] [0x0] [FSCK] valid_block_count matching with CP [Ok..] [0xdf299] [FSCK] valid_node_count matcing with CP (de lookup) [Ok..] [0x2b01] [FSCK] valid_node_count matcing with CP (nat lookup) [Ok..] [0x2b01] [FSCK] valid_inode_count matched with CP [Ok..] [0x2665] [FSCK] free segment_count matched with CP [Ok..] [0xcb04] [FSCK] next block offset is free [Ok..] [FSCK] fixing SIT types [FSCK] other corrupted bugs [Fail] The root cause is: If we open file w/ readonly flag, disk quota info won't be initialized for this file, however, following mmap() will force to convert inline inode via f2fs_convert_inline_inode(), which may increase block usage for this inode w/o updating quota data, it causes inconsistent disk quota info. The issue will happen in following stack: open(file, O_RDONLY) mmap(file) - f2fs_convert_inline_inode - f2fs_convert_inline_page - f2fs_reserve_block - f2fs_reserve_new_block - f2fs_reserve_new_blocks - f2fs_i_blocks_write - dquot_claim_block inode->i_blocks increase, but the dqb_curspace keep the size for the dquots is NULL. To fix this issue, let's call dquot_initialize() anyway in both f2fs_truncate() and f2fs_convert_inline_inode() functions to avoid potential inconsistent quota data issue. 
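The user-visible trigger is tiny; the sketch below simply replays the call stack above from userspace ("inline_file" is a placeholder for a small file whose data is stored inline, on a quota-enabled f2fs mount). Before the fix, the read-only open skipped quota initialization while the mapping still forced the inline-to-regular conversion that reserves a block.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("inline_file", O_RDONLY);	/* no dquot_initialize() on this open path */
		void *p;

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* mmap() is what used to reach f2fs_convert_inline_inode(), bumping
		 * i_blocks without a matching quota charge. */
		p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
		if (p != MAP_FAILED)
			munmap(p, 4096);

		close(fd);
		return 0;
	}
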
Fixes: 0abd675e97e6 ("f2fs: support plain user/group quota") Signed-off-by: Daiyue Zhang Signed-off-by: Dehe Gu Signed-off-by: Junchao Jiang Signed-off-by: Ge Qiu Signed-off-by: Yi Chen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ fs/f2fs/inline.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3e37198a880d..38b06941c9f5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -790,6 +790,10 @@ int f2fs_truncate(struct inode *inode) return -EIO; } + err = dquot_initialize(inode); + if (err) + return err; + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 6cd739114c9d..936df9342fcd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -191,6 +191,10 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; + err = dquot_initialize(inode); + if (err) + return err; + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; From 04dddfa5c2fad441e3ca5d68ce210b556e0442c4 Mon Sep 17 00:00:00 2001 From: Liu Song Date: Sun, 31 Jan 2021 20:26:05 +0800 Subject: [PATCH 166/580] f2fs: remove unnecessary initialization in xattr.c These variables will be explicitly assigned before use, so there is no need to initialize. Signed-off-by: Liu Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 2086bef6c154..8159fae74b9a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -327,7 +327,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int inline_size = inline_xattr_size(inode); - int err = 0; + int err; if (!xnid && !inline_size) return -ENODATA; @@ -515,7 +515,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { struct f2fs_xattr_entry *entry = NULL; - int error = 0; + int error; unsigned int size, len; void *base_addr = NULL; int base_size; @@ -562,7 +562,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct f2fs_xattr_entry *entry; void *base_addr, *last_base_addr; - int error = 0; + int error; size_t rest = buffer_size; down_read(&F2FS_I(inode)->i_xattr_sem); @@ -632,7 +632,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, int found, newsize; size_t len; __u32 new_hsize; - int error = 0; + int error; if (name == NULL) return -EINVAL; From 406c773c874833ab79c92112c31f132bad328b5d Mon Sep 17 00:00:00 2001 From: Dehe Gu Date: Tue, 2 Feb 2021 17:39:22 +0800 Subject: [PATCH 167/580] f2fs: fix a wrong condition in __submit_bio We should use !F2FS_IO_ALIGNED() to check and submit_io directly. 
Fixes: 8223ecc456d0 ("f2fs: fix to add missing F2FS_IO_ALIGNED() condition") Reviewed-by: Chao Yu Signed-off-by: Dehe Gu Signed-off-by: Ge Qiu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b7bcaa7a4113..21eb4296a56d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -441,7 +441,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); - if (F2FS_IO_ALIGNED(sbi)) + if (!F2FS_IO_ALIGNED(sbi)) goto submit_io; start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; From 84893428bd3dc8f66b08015bc7343be37ec07d3c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 2 Feb 2021 16:03:11 +0800 Subject: [PATCH 168/580] f2fs: relocate inline conversion from mmap() to mkwrite() If there is page fault only for read case on inline inode, we don't need to convert inline inode, instead, let's do conversion for write case. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 38b06941c9f5..7726519d6168 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -72,6 +72,10 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } + err = f2fs_convert_inline_inode(inode); + if (err) + goto err; + #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret = f2fs_is_compressed_cluster(inode, page->index); @@ -525,7 +529,6 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - int err; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -533,11 +536,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - /* we don't need to use inline_data strictly */ - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; set_inode_flag(inode, FI_MMAP_FILE); From 259b547d9abf3bec965021d2912a2dbce82d3e01 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 19 Jan 2021 09:00:42 +0900 Subject: [PATCH 169/580] f2fs: introduce checkpoint_merge mount option We've added a new mount options, "checkpoint_merge" and "nocheckpoint_merge", which creates a kernel daemon and makes it to merge concurrent checkpoint requests as much as possible to eliminate redundant checkpoint issues. Plus, we can eliminate the sluggish issue caused by slow checkpoint operation when the checkpoint is done in a process context in a cgroup having low i/o budget and cpu shares. To make this do better, we set the default i/o priority of the kernel daemon to "3", to give one higher priority than other kernel threads. The below verification result explains this. The basic idea has come from https://opensource.samsung.com. [Verification] Android Pixel Device(ARM64, 7GB RAM, 256GB UFS) Create two I/O cgroups (fg w/ weight 100, bg w/ wight 20) Set "strict_guarantees" to "1" in BFQ tunables In "fg" cgroup, - thread A => trigger 1000 checkpoint operations "for i in `seq 1 1000`; do touch test_dir1/file; fsync test_dir1; done" - thread B => gererating async. 
I/O "fio --rw=write --numjobs=1 --bs=128k --runtime=3600 --time_based=1 --filename=test_img --name=test" In "bg" cgroup, - thread C => trigger repeated checkpoint operations "echo $$ > /dev/blkio/bg/tasks; while true; do touch test_dir2/file; fsync test_dir2; done" We've measured thread A's execution time. [ w/o patch ] Elapsed Time: Avg. 68 seconds [ w/ patch ] Elapsed Time: Avg. 48 seconds Reported-by: kernel test robot Reported-by: Dan Carpenter [Jaegeuk Kim: fix the return value in f2fs_start_ckpt_thread, reported by Dan] Signed-off-by: Daeho Jeong Signed-off-by: Sungjong Seo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 11 ++ fs/f2fs/checkpoint.c | 177 +++++++++++++++++++++++++++++ fs/f2fs/debug.c | 12 ++ fs/f2fs/f2fs.h | 27 +++++ fs/f2fs/super.c | 58 ++++++++-- 5 files changed, 277 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 5eff4009e77e..f75ec244762f 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -247,6 +247,17 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl hide up to all remaining free space. The actual space that would be unusable can be viewed at /sys/fs/f2fs//unusable This space is reclaimed once checkpoint=enable. +checkpoint_merge When checkpoint is enabled, this can be used to create a kernel + daemon and make it to merge concurrent checkpoint requests as + much as possible to eliminate redundant checkpoint issues. Plus, + we can eliminate the sluggish issue caused by slow checkpoint + operation when the checkpoint is done in a process context in + a cgroup having low i/o budget and cpu shares. To make this + do better, we set the default i/o priority of the kernel daemon + to "3", to give one higher priority than other kernel threads. + This is the same way to give a I/O priority to the jbd2 + journaling thread of ext4 filesystem. +nocheckpoint_merge Disable checkpoint merge feature. compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", "lz4", "zstd" and "lzo-rle" algorithm. 
compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 404420c6801b..814276a5e418 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -13,12 +13,15 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" #include "segment.h" #include +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; @@ -1705,3 +1708,177 @@ void f2fs_destroy_checkpoint_caches(void) kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(f2fs_inode_entry_slab); } + +static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc = { .reason = CP_SYNC, }; + int err; + + down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return err; +} + +static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req *req, *next; + struct llist_node *dispatch_list; + u64 sum_diff = 0, diff, count = 0; + int ret; + + dispatch_list = llist_del_all(&cprc->issue_list); + if (!dispatch_list) + return; + dispatch_list = llist_reverse_order(dispatch_list); + + ret = __write_checkpoint_sync(sbi); + atomic_inc(&cprc->issued_ckpt); + + llist_for_each_entry_safe(req, next, dispatch_list, llnode) { + diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); + req->ret = ret; + complete(&req->wait); + + sum_diff += diff; + count++; + } + atomic_sub(count, &cprc->queued_ckpt); + atomic_add(count, &cprc->total_ckpt); + + spin_lock(&cprc->stat_lock); + cprc->cur_time = (unsigned int)div64_u64(sum_diff, count); + if (cprc->peak_time < cprc->cur_time) + cprc->peak_time = cprc->cur_time; + spin_unlock(&cprc->stat_lock); +} + +static int issue_checkpoint_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct ckpt_req_control *cprc = &sbi->cprc_info; + wait_queue_head_t *q = &cprc->ckpt_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + sb_start_intwrite(sbi->sb); + + if (!llist_empty(&cprc->issue_list)) + __checkpoint_and_complete_reqs(sbi); + + sb_end_intwrite(sbi->sb); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&cprc->issue_list)); + goto repeat; +} + +static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, + struct ckpt_req *wait_req) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (!llist_empty(&cprc->issue_list)) { + __checkpoint_and_complete_reqs(sbi); + } else { + /* already dispatched by issue_checkpoint_thread */ + if (wait_req) + wait_for_completion(&wait_req->wait); + } +} + +static void init_ckpt_req(struct ckpt_req *req) +{ + memset(req, 0, sizeof(struct ckpt_req)); + + init_completion(&req->wait); + req->queue_time = ktime_get(); +} + +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req req; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + int ret; + + down_write(&sbi->gc_lock); + ret = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return ret; + } + + if (!cprc->f2fs_issue_ckpt) + return __write_checkpoint_sync(sbi); + + init_ckpt_req(&req); + + llist_add(&req.llnode, &cprc->issue_list); + atomic_inc(&cprc->queued_ckpt); + + /* update issue_list before we wake up issue_checkpoint thread */ + smp_mb(); + + if 
(waitqueue_active(&cprc->ckpt_wait_queue)) + wake_up(&cprc->ckpt_wait_queue); + + if (cprc->f2fs_issue_ckpt) + wait_for_completion(&req.wait); + else + flush_remained_ckpt_reqs(sbi, &req); + + return req.ret; +} + +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) + return 0; + + cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, + "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(cprc->f2fs_issue_ckpt)) { + cprc->f2fs_issue_ckpt = NULL; + return -ENOMEM; + } + + set_task_ioprio(cprc->f2fs_issue_ckpt, DEFAULT_CHECKPOINT_IOPRIO); + + return 0; +} + +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) { + struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); + + flush_remained_ckpt_reqs(sbi, NULL); + } +} + +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + atomic_set(&cprc->issued_ckpt, 0); + atomic_set(&cprc->total_ckpt, 0); + atomic_set(&cprc->queued_ckpt, 0); + init_waitqueue_head(&cprc->ckpt_wait_queue); + init_llist_head(&cprc->issue_list); + spin_lock_init(&cprc->stat_lock); +} diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 197c914119da..91855d5721cd 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; } + si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt); + si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt); + si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt); + spin_lock(&sbi->cprc_info.stat_lock); + si->cur_ckpt_time = sbi->cprc_info.cur_time; + si->peak_ckpt_time = sbi->cprc_info.peak_time; + spin_unlock(&sbi->cprc_info.stat_lock); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -417,6 +424,11 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", + si->nr_queued_ckpt, si->nr_issued_ckpt, + si->nr_total_ckpt, si->cur_ckpt_time, + si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d96254494f9d..8f8d39f22023 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -96,6 +96,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -266,6 +267,25 @@ struct fsync_node_entry { unsigned int seq_id; /* sequence id */ }; +struct ckpt_req { + struct completion wait; /* completion for checkpoint done */ + struct llist_node llnode; /* llist_node to be linked in wait queue */ + int ret; /* return code of checkpoint */ + ktime_t queue_time; /* request queued time */ +}; + +struct 
ckpt_req_control { + struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_ckpt; /* # of actually issued ckpts */ + atomic_t total_ckpt; /* # of total ckpts */ + atomic_t queued_ckpt; /* # of queued ckpts */ + struct llist_head issue_list; /* list for command issue */ + spinlock_t stat_lock; /* lock for below checkpoint time stats */ + unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ + unsigned int peak_time; /* peak wait time in msec until now */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -1438,6 +1458,7 @@ struct f2fs_sb_info { wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ + struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -3459,6 +3480,10 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); void f2fs_destroy_checkpoint_caches(void); +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c @@ -3574,6 +3599,8 @@ struct f2fs_stat_info { int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; + int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; + unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c371458c3413..014536d490c1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -142,6 +142,8 @@ enum { Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_checkpoint_merge, + Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, @@ -211,6 +213,8 @@ static match_table_t f2fs_tokens = { {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_checkpoint_merge, "checkpoint_merge"}, + {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, @@ -931,6 +935,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_checkpoint_merge: + set_opt(sbi, MERGE_CHECKPOINT); + break; + case Opt_nocheckpoint_merge: + clear_opt(sbi, MERGE_CHECKPOINT); + break; #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { @@ -1331,6 +1341,12 @@ static void f2fs_put_super(struct super_block *sb) /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); + /* + * flush all issued checkpoints and stop checkpoint issue thread. + * after then, all checkpoints should be done by each process context. + */ + f2fs_stop_ckpt_thread(sbi); + /* * We don't need to do checkpoint when superblock is clean. 
* But, the previous checkpoint was not done by umount, it needs to do @@ -1429,15 +1445,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; - if (sync) { - struct cp_control cpc; + if (sync) + err = f2fs_issue_checkpoint(sbi); - cpc.reason = __get_cp_reason(sbi); - - down_write(&sbi->gc_lock); - err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); - } return err; } @@ -1756,6 +1766,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISABLE_CHECKPOINT)) seq_printf(seq, ",checkpoint=disable:%u", F2FS_OPTION(sbi).unusable_cap); + if (test_opt(sbi, MERGE_CHECKPOINT)) + seq_puts(seq, ",checkpoint_merge"); + else + seq_puts(seq, ",nocheckpoint_merge"); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) @@ -2037,6 +2051,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_gc; + } + } else { + f2fs_stop_ckpt_thread(sbi); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -3761,6 +3788,19 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_init_fsync_node_info(sbi); + /* setup checkpoint request control and start checkpoint issue thread */ + f2fs_init_ckpt_req_control(sbi); + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto stop_ckpt_thread; + } + } + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { @@ -3959,6 +3999,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_sm: f2fs_destroy_segment_manager(sbi); f2fs_destroy_post_read_wq(sbi); +stop_ckpt_thread: + f2fs_stop_ckpt_thread(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); From 0dd64c26c1488b4cc483c2d8d799541676a5a6a3 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 21 Jan 2021 22:45:29 +0900 Subject: [PATCH 170/580] f2fs: add ckpt_thread_ioprio sysfs node Added "ckpt_thread_ioprio" sysfs node to give a way to change checkpoint merge daemon's io priority. Its default value is "be,3", which means "BE" I/O class and I/O priority "3". We can select the class between "rt" and "be", and set the I/O priority within valid range of it. "," delimiter is necessary in between I/O class and priority number. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 9 ++++ fs/f2fs/checkpoint.c | 3 +- fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 55 +++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index abb7717eb31d..8c90e0764050 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -380,3 +380,12 @@ Description: Show status of f2fs superblock in real time. 
0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted 0x2000 SBI_IS_RESIZEFS resizefs is in process ====== ===================== ================================= + +What: /sys/fs/f2fs//ckpt_thread_ioprio +Date: January 2021 +Contact: "Daeho Jeong" +Description: Give a way to change checkpoint merge daemon's io priority. + Its default value is "be,3", which means "BE" I/O class and + I/O priority "3". We can select the class between "rt" and "be", + and set the I/O priority within valid range of it. "," delimiter + is necessary in between I/O class and priority number. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 814276a5e418..e67da7ed4d61 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1852,7 +1852,7 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) return -ENOMEM; } - set_task_ioprio(cprc->f2fs_issue_ckpt, DEFAULT_CHECKPOINT_IOPRIO); + set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); return 0; } @@ -1878,6 +1878,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) atomic_set(&cprc->issued_ckpt, 0); atomic_set(&cprc->total_ckpt, 0); atomic_set(&cprc->queued_ckpt, 0); + cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; init_waitqueue_head(&cprc->ckpt_wait_queue); init_llist_head(&cprc->issue_list); spin_lock_init(&cprc->stat_lock); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8f8d39f22023..c51cccc9d21b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -276,6 +276,7 @@ struct ckpt_req { struct ckpt_req_control { struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ atomic_t issued_ckpt; /* # of actually issued ckpts */ atomic_t total_ckpt; /* # of total ckpts */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 81840ba130e4..d36a5a86b92c 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -34,6 +35,7 @@ enum { FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ + CPRC_INFO, /* struct ckpt_req_control */ }; struct f2fs_attr { @@ -70,6 +72,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) else if (struct_type == STAT_INFO) return (unsigned char *)F2FS_STAT(sbi); #endif + else if (struct_type == CPRC_INFO) + return (unsigned char *)&sbi->cprc_info; return NULL; } @@ -270,6 +274,23 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + struct ckpt_req_control *cprc = &sbi->cprc_info; + int len = 0; + int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); + int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); + + if (class == IOPRIO_CLASS_RT) + len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); + else if (class == IOPRIO_CLASS_BE) + len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); + else + return -EINVAL; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -323,6 +344,38 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return ret ? 
ret : count; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + const char *name = strim((char *)buf); + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class; + long data; + int ret; + + if (!strncmp(name, "rt,", 3)) + class = IOPRIO_CLASS_RT; + else if (!strncmp(name, "be,", 3)) + class = IOPRIO_CLASS_BE; + else + return -EINVAL; + + name += 3; + ret = kstrtol(name, 10, &data); + if (ret) + return ret; + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); + if (test_opt(sbi, MERGE_CHECKPOINT)) { + ret = set_task_ioprio(cprc->f2fs_issue_ckpt, + cprc->ckpt_thread_ioprio); + if (ret) + return ret; + } + + return count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -582,6 +635,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); @@ -667,6 +721,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), ATTR_LIST(unusable), From ee6e2341a92e5fb683e2457cf9aa776e83ac66e0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 8 Feb 2021 13:42:21 -0800 Subject: [PATCH 171/580] f2fs: don't grab superblock freeze for flush/ckpt thread These are controlled by f2fs_freeze(). This fixes xfstests/generic/068 which is stuck at task:f2fs_ckpt-252:3 state:D stack: 0 pid: 5761 ppid: 2 flags:0x00004000 Call Trace: __schedule+0x44c/0x8a0 schedule+0x4f/0xc0 percpu_rwsem_wait+0xd8/0x140 ? percpu_down_write+0xf0/0xf0 __percpu_down_read+0x56/0x70 issue_checkpoint_thread+0x12c/0x160 [f2fs] ? wait_woken+0x80/0x80 kthread+0x114/0x150 ? __checkpoint_and_complete_reqs+0x110/0x110 [f2fs] ?
kthread_park+0x90/0x90 ret_from_fork+0x22/0x30 Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ---- fs/f2fs/segment.c | 4 ---- fs/f2fs/super.c | 4 ++++ 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e67da7ed4d61..c62be7bf819c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1764,13 +1764,9 @@ static int issue_checkpoint_thread(void *data) if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&cprc->issue_list)) __checkpoint_and_complete_reqs(sbi); - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&cprc->issue_list)); goto repeat; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 651514e8f226..c3988bd6eb80 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -607,8 +607,6 @@ static int issue_flush_thread(void *data) if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -629,8 +627,6 @@ static int issue_flush_thread(void *data) fcc->dispatch_list = NULL; } - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 014536d490c1..8910cca3bbba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1463,6 +1463,10 @@ static int f2fs_freeze(struct super_block *sb) /* must be clean, since sync_filesystem() was already called */ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; + + /* ensure no checkpoint required */ + if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) + return -EINVAL; return 0; } From 3bdc297d53fd4dbb882bb4e321f2ed037a354b3d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 Feb 2021 14:09:54 -0800 Subject: [PATCH 172/580] f2fs: give a warning only for readonly partition Let's allow mounting a readonly partition. We're able to recover it later once it is mounted back as read-write. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8910cca3bbba..3cf23968e8a0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3890,12 +3890,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * previous checkpoint was not done by clean system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - err = -EROFS; + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - goto free_meta; - } - f2fs_info(sbi, "write access unavailable, skipping recovery"); + else + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } From 0018bbabf3415dd9c682b37497d301c845dd3510 Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Thu, 4 Feb 2021 21:25:56 +0800 Subject: [PATCH 173/580] Documentation: f2fs: fix typo s/automaic/automatic Fix typo in f2fs documentation. Signed-off-by: Ed Tsai Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index f75ec244762f..81c05baa8312 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -847,7 +847,7 @@ This is the default option.
f2fs does automatic compression in the writeback of compression enabled files. 2) compress_mode=user -This disables the automaic compression and gives the user discretion of choosing the +This disables the automatic compression and gives the user discretion of choosing the target file and the timing. The user can do manual compression/decompression on the compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE ioctls like the below. From f224aee7aefeb9794ef9870edbc65d6192db5167 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 21 Jul 2020 15:59:20 -0700 Subject: [PATCH 174/580] fs-verity: use smp_load_acquire() for ->i_verity_info Normally smp_store_release() or cmpxchg_release() is paired with smp_load_acquire(). Sometimes smp_load_acquire() can be replaced with the more lightweight READ_ONCE(). However, for this to be safe, all the published memory must only be accessed in a way that involves the pointer itself. This may not be the case if allocating the object also involves initializing a static or global variable, for example. fsverity_info::tree_params.hash_alg->tfm is a crypto_ahash object that's internal to and is allocated by the crypto subsystem. So by using READ_ONCE() for ->i_verity_info, we're relying on internal implementation details of the crypto subsystem. Remove this fragile assumption by using smp_load_acquire() instead. Also fix the cmpxchg logic to correctly execute an ACQUIRE barrier when losing the cmpxchg race, since cmpxchg doesn't guarantee a memory barrier on failure. (Note: I haven't seen any real-world problems here. This change is just fixing the code to be guaranteed correct and less fragile.) Fixes: fd2d1acfcadf ("fs-verity: add the hook for file ->open()") Link: https://lore.kernel.org/r/20200721225920.114347-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/verity/open.c | 15 ++++++++++++--- include/linux/fsverity.h | 9 +++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/verity/open.c b/fs/verity/open.c index d007db0c9304..bfe0280c14e4 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -221,11 +221,20 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* - * Multiple processes may race to set ->i_verity_info, so use cmpxchg. - * This pairs with the READ_ONCE() in fsverity_get_info(). + * Multiple tasks may race to set ->i_verity_info, so use + * cmpxchg_release(). This pairs with the smp_load_acquire() in + * fsverity_get_info(). I.e., here we publish ->i_verity_info with a + * RELEASE barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL) + if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { + /* Lost the race, so free the fsverity_info we allocated. */ fsverity_free_info(vi); + /* + * Afterwards, the caller may access ->i_verity_info directly, + * so make sure to ACQUIRE the winning fsverity_info. 
+ */ + (void)fsverity_get_info(inode); + } } void fsverity_free_info(struct fsverity_info *vi) diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 78201a6d35f6..c1144a450392 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -115,8 +115,13 @@ struct fsverity_operations { static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { - /* pairs with the cmpxchg() in fsverity_set_info() */ - return READ_ONCE(inode->i_verity_info); + /* + * Pairs with the cmpxchg_release() in fsverity_set_info(). + * I.e., another task may publish ->i_verity_info concurrently, + * executing a RELEASE barrier. We need to use smp_load_acquire() here + * to safely ACQUIRE the memory the other task published. + */ + return smp_load_acquire(&inode->i_verity_info); } /* enable.c */ From e334646d47af63a1acdf73b3cc5c8dfb0e90d7f6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Nov 2020 13:19:15 -0800 Subject: [PATCH 175/580] fs-verity: remove filenames from file comments Embedding the file path inside kernel source code files isn't particularly useful as often files are moved around and the paths become incorrect. checkpatch.pl warns about this since v5.10-rc1. Acked-by: Luca Boccassi Link: https://lore.kernel.org/r/20201113211918.71883-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/verity/enable.c | 2 +- fs/verity/hash_algs.c | 2 +- fs/verity/init.c | 2 +- fs/verity/measure.c | 2 +- fs/verity/open.c | 2 +- fs/verity/signature.c | 2 +- fs/verity/verify.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index d734cebaae70..04c9bba6c28e 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/enable.c: ioctl to enable verity on a file + * Ioctl to enable verity on a file * * Copyright 2019 Google LLC */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index c37e186ebeb6..71d0fccb6d4c 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/hash_algs.c: fs-verity hash algorithms + * fs-verity hash algorithms * * Copyright 2019 Google LLC */ diff --git a/fs/verity/init.c b/fs/verity/init.c index 94c104e00861..c98b7016f446 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/init.c: fs-verity module initialization and logging + * fs-verity module initialization and logging * * Copyright 2019 Google LLC */ diff --git a/fs/verity/measure.c b/fs/verity/measure.c index df409a5682ed..5300b8d38537 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/measure.c: ioctl to get a verity file's measurement + * Ioctl to get a verity file's measurement * * Copyright 2019 Google LLC */ diff --git a/fs/verity/open.c b/fs/verity/open.c index bfe0280c14e4..a28d5be78a09 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/open.c: opening fs-verity files + * Opening fs-verity files * * Copyright 2019 Google LLC */ diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 9cfd2f5ff57d..5c95db52b744 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/signature.c: verification of builtin signatures + * Verification of builtin signatures * * Copyright 2019 Google 
LLC */ diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 0ee4386386ec..817ce4f8cd3c 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages() + * Data verification functions, i.e. hooks for ->readpages() * * Copyright 2019 Google LLC */ From 8284c77ceb0495a5c1bd3416995aac548de344bd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Nov 2020 13:19:16 -0800 Subject: [PATCH 176/580] fs-verity: rename fsverity_signed_digest to fsverity_formatted_digest The name "struct fsverity_signed_digest" is causing confusion because it isn't actually a signed digest, but rather it's the way that the digest is formatted in order to be signed. Rename it to "struct fsverity_formatted_digest" to prevent this confusion. Also update the struct's comment to clarify that it's specific to the built-in signature verification support and isn't a requirement for all fs-verity users. I'll be renaming this struct in fsverity-utils too. Acked-by: Luca Boccassi Link: https://lore.kernel.org/r/20201113211918.71883-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 2 +- fs/verity/fsverity_private.h | 17 ++++++++++++----- fs/verity/signature.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index a95536b6443c..b5978f34d6f7 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -372,7 +372,7 @@ kernel. Specifically, it adds support for: File measurements must be signed in the following format, which is similar to the structure used by `FS_IOC_MEASURE_VERITY`_:: - struct fsverity_signed_digest { + struct fsverity_formatted_digest { char magic[8]; /* must be "FSVerity" */ __le16 digest_algorithm; __le16 digest_size; diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index e96d99d5145e..75f8e18b44a5 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -101,12 +101,19 @@ struct fsverity_descriptor { sizeof(struct fsverity_descriptor)) /* - * Format in which verity file measurements are signed. This is the same as - * 'struct fsverity_digest', except here some magic bytes are prepended to - * provide some context about what is being signed in case the same key is used - * for non-fsverity purposes, and here the fields have fixed endianness. + * Format in which verity file measurements are signed in built-in signatures. + * This is the same as 'struct fsverity_digest', except here some magic bytes + * are prepended to provide some context about what is being signed in case the + * same key is used for non-fsverity purposes, and here the fields have fixed + * endianness. + * + * This struct is specific to the built-in signature verification support, which + * is optional. fs-verity users may also verify signatures in userspace, in + * which case userspace is responsible for deciding on what bytes are signed. + * This struct may still be used, but it doesn't have to be. For example, + * userspace could instead use a string like "sha256:$digest_as_hex_string". 
*/ -struct fsverity_signed_digest { +struct fsverity_formatted_digest { char magic[8]; /* must be "FSVerity" */ __le16 digest_algorithm; __le16 digest_size; diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 5c95db52b744..4ac9dff281b5 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -44,7 +44,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi, const struct inode *inode = vi->inode; const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; const u32 sig_size = le32_to_cpu(desc->sig_size); - struct fsverity_signed_digest *d; + struct fsverity_formatted_digest *d; int err; if (sig_size == 0) { From 58343491fe0980bd183ad47674e1e9bcbcb50fde Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Nov 2020 13:19:17 -0800 Subject: [PATCH 177/580] fs-verity: rename "file measurement" to "file digest" I originally chose the name "file measurement" to refer to the fs-verity file digest to avoid confusion with traditional full-file digests or with the bare root hash of the Merkle tree. But the name "file measurement" hasn't caught on, and usually people are calling it something else, usually the "file digest". E.g. see "struct fsverity_digest" and "struct fsverity_formatted_digest", the libfsverity_compute_digest() and libfsverity_sign_digest() functions in libfsverity, and the "fsverity digest" command. Having multiple names for the same thing is always confusing. So to hopefully avoid confusion in the future, rename "fs-verity file measurement" to "fs-verity file digest". This leaves FS_IOC_MEASURE_VERITY as the only reference to "measure" in the kernel, which makes some amount of sense since the ioctl is actively "measuring" the file. I'll be renaming this in fsverity-utils too (though similarly the 'fsverity measure' command, which is a wrapper for FS_IOC_MEASURE_VERITY, will stay). Acked-by: Luca Boccassi Link: https://lore.kernel.org/r/20201113211918.71883-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 60 +++++++++++++------------- fs/verity/enable.c | 6 +-- fs/verity/fsverity_private.h | 12 +++--- fs/verity/measure.c | 12 +++--- fs/verity/open.c | 22 +++++----- fs/verity/signature.c | 10 ++--- 6 files changed, 61 insertions(+), 61 deletions(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index b5978f34d6f7..7c23f0b58ee3 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -27,9 +27,9 @@ automatically verified against the file's Merkle tree. Reads of any corrupted data, including mmap reads, will fail. Userspace can use another ioctl to retrieve the root hash (actually -the "file measurement", which is a hash that includes the root hash) -that fs-verity is enforcing for the file. This ioctl executes in -constant time, regardless of the file size. +the "fs-verity file digest", which is a hash that includes the Merkle +tree root hash) that fs-verity is enforcing for the file. This ioctl +executes in constant time, regardless of the file size. fs-verity is essentially a way to hash a file in constant time, subject to the caveat that reads which would violate the hash will @@ -177,9 +177,10 @@ FS_IOC_ENABLE_VERITY can fail with the following errors: FS_IOC_MEASURE_VERITY --------------------- -The FS_IOC_MEASURE_VERITY ioctl retrieves the measurement of a verity -file. The file measurement is a digest that cryptographically -identifies the file contents that are being enforced on reads. 
+The FS_IOC_MEASURE_VERITY ioctl retrieves the digest of a verity file. +The fs-verity file digest is a cryptographic digest that identifies +the file contents that are being enforced on reads; it is computed via +a Merkle tree and is different from a traditional full-file digest. This ioctl takes in a pointer to a variable-length structure:: @@ -197,7 +198,7 @@ On success, 0 is returned and the kernel fills in the structure as follows: - ``digest_algorithm`` will be the hash algorithm used for the file - measurement. It will match ``fsverity_enable_arg::hash_algorithm``. + digest. It will match ``fsverity_enable_arg::hash_algorithm``. - ``digest_size`` will be the size of the digest in bytes, e.g. 32 for SHA-256. (This can be redundant with ``digest_algorithm``.) - ``digest`` will be the actual bytes of the digest. @@ -257,25 +258,24 @@ non-verity one, with the following exceptions: with EIO (for read()) or SIGBUS (for mmap() reads). - If the sysctl "fs.verity.require_signatures" is set to 1 and the - file's verity measurement is not signed by a key in the fs-verity - keyring, then opening the file will fail. See `Built-in signature - verification`_. + file is not signed by a key in the fs-verity keyring, then opening + the file will fail. See `Built-in signature verification`_. Direct access to the Merkle tree is not supported. Therefore, if a verity file is copied, or is backed up and restored, then it will lose its "verity"-ness. fs-verity is primarily meant for files like executables that are managed by a package manager. -File measurement computation -============================ +File digest computation +======================= This section describes how fs-verity hashes the file contents using a -Merkle tree to produce the "file measurement" which cryptographically -identifies the file contents. This algorithm is the same for all -filesystems that support fs-verity. +Merkle tree to produce the digest which cryptographically identifies +the file contents. This algorithm is the same for all filesystems +that support fs-verity. Userspace only needs to be aware of this algorithm if it needs to -compute the file measurement itself, e.g. in order to sign the file. +compute fs-verity file digests itself, e.g. in order to sign files. .. _fsverity_merkle_tree: @@ -325,9 +325,9 @@ can't a distinguish a large file from a small second file whose data is exactly the top-level hash block of the first file. Ambiguities also arise from the convention of padding to the next block boundary. -To solve this problem, the verity file measurement is actually -computed as a hash of the following structure, which contains the -Merkle tree root hash as well as other fields such as the file size:: +To solve this problem, the fs-verity file digest is actually computed +as a hash of the following structure, which contains the Merkle tree +root hash as well as other fields such as the file size:: struct fsverity_descriptor { __u8 version; /* must be 1 */ @@ -359,18 +359,18 @@ kernel. Specifically, it adds support for: certificates from being added. 2. `FS_IOC_ENABLE_VERITY`_ accepts a pointer to a PKCS#7 formatted - detached signature in DER format of the file measurement. On - success, this signature is persisted alongside the Merkle tree. + detached signature in DER format of the file's fs-verity digest. + On success, this signature is persisted alongside the Merkle tree. 
Then, any time the file is opened, the kernel will verify the - file's actual measurement against this signature, using the - certificates in the ".fs-verity" keyring. + file's actual digest against this signature, using the certificates + in the ".fs-verity" keyring. 3. A new sysctl "fs.verity.require_signatures" is made available. When set to 1, the kernel requires that all verity files have a - correctly signed file measurement as described in (2). + correctly signed digest as described in (2). -File measurements must be signed in the following format, which is -similar to the structure used by `FS_IOC_MEASURE_VERITY`_:: +fs-verity file digests must be signed in the following format, which +is similar to the structure used by `FS_IOC_MEASURE_VERITY`_:: struct fsverity_formatted_digest { char magic[8]; /* must be "FSVerity" */ @@ -421,8 +421,8 @@ can only be set by `FS_IOC_ENABLE_VERITY`_, and it cannot be cleared. ext4 also supports encryption, which can be used simultaneously with fs-verity. In this case, the plaintext data is verified rather than -the ciphertext. This is necessary in order to make the file -measurement meaningful, since every file is encrypted differently. +the ciphertext. This is necessary in order to make the fs-verity file +digest meaningful, since every file is encrypted differently. ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond @@ -592,8 +592,8 @@ weren't already directly answered in other parts of this document. :Q: Isn't fs-verity useless because the attacker can just modify the hashes in the Merkle tree, which is stored on-disk? :A: To verify the authenticity of an fs-verity file you must verify - the authenticity of the "file measurement", which is basically the - root hash of the Merkle tree. See `Use cases`_. + the authenticity of the "fs-verity file digest", which + incorporates the root hash of the Merkle tree. See `Use cases`_. :Q: Isn't fs-verity useless because the attacker can just replace a verity file with a non-verity one? diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 04c9bba6c28e..835c3399fee5 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -398,9 +398,9 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) * Some pages of the file may have been evicted from pagecache after * being used in the Merkle tree construction, then read into pagecache * again by another process reading from the file concurrently. Since - * these pages didn't undergo verification against the file measurement - * which fs-verity now claims to be enforcing, we have to wipe the - * pagecache to ensure that all future reads are verified. + * these pages didn't undergo verification against the file digest which + * fs-verity now claims to be enforcing, we have to wipe the pagecache + * to ensure that all future reads are verified. */ filemap_write_and_wait(inode->i_mapping); invalidate_inode_pages2(inode->i_mapping); diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 75f8e18b44a5..21e9930d65fb 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -67,19 +67,19 @@ struct merkle_tree_params { * When a verity file is first opened, an instance of this struct is allocated * and stored in ->i_verity_info; it remains until the inode is evicted. It * caches information about the Merkle tree that's needed to efficiently verify - * data read from the file. It also caches the file measurement. 
The Merkle - * tree pages themselves are not cached here, but the filesystem may cache them. + * data read from the file. It also caches the file digest. The Merkle tree + * pages themselves are not cached here, but the filesystem may cache them. */ struct fsverity_info { struct merkle_tree_params tree_params; u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE]; - u8 measurement[FS_VERITY_MAX_DIGEST_SIZE]; + u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE]; const struct inode *inode; }; /* - * Merkle tree properties. The file measurement is the hash of this structure - * excluding the signature and with the sig_size field set to 0. + * Merkle tree properties. The fs-verity file digest is the hash of this + * structure excluding the signature and with the sig_size field set to 0. */ struct fsverity_descriptor { __u8 version; /* must be 1 */ @@ -101,7 +101,7 @@ struct fsverity_descriptor { sizeof(struct fsverity_descriptor)) /* - * Format in which verity file measurements are signed in built-in signatures. + * Format in which fs-verity file digests are signed in built-in signatures. * This is the same as 'struct fsverity_digest', except here some magic bytes * are prepended to provide some context about what is being signed in case the * same key is used for non-fsverity purposes, and here the fields have fixed diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 5300b8d38537..f0d7b30c62db 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Ioctl to get a verity file's measurement + * Ioctl to get a verity file's digest * * Copyright 2019 Google LLC */ @@ -10,12 +10,12 @@ #include /** - * fsverity_ioctl_measure() - get a verity file's measurement - * @filp: file to get measurement of + * fsverity_ioctl_measure() - get a verity file's digest + * @filp: file to get digest of * @_uarg: user pointer to fsverity_digest * - * Retrieve the file measurement that the kernel is enforcing for reads from a - * verity file. See the "FS_IOC_MEASURE_VERITY" section of + * Retrieve the file digest that the kernel is enforcing for reads from a verity + * file. See the "FS_IOC_MEASURE_VERITY" section of * Documentation/filesystems/fsverity.rst for the documentation. * * Return: 0 on success, -errno on failure @@ -51,7 +51,7 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg) if (copy_to_user(uarg, &arg, sizeof(arg))) return -EFAULT; - if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size)) + if (copy_to_user(uarg->digest, vi->file_digest, hash_alg->digest_size)) return -EFAULT; return 0; diff --git a/fs/verity/open.c b/fs/verity/open.c index a28d5be78a09..228d0eca3e2e 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -124,18 +124,18 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, } /* - * Compute the file measurement by hashing the fsverity_descriptor excluding the + * Compute the file digest by hashing the fsverity_descriptor excluding the * signature and with the sig_size field set to 0. 
*/ -static int compute_file_measurement(struct fsverity_hash_alg *hash_alg, - struct fsverity_descriptor *desc, - u8 *measurement) +static int compute_file_digest(struct fsverity_hash_alg *hash_alg, + struct fsverity_descriptor *desc, + u8 *file_digest) { __le32 sig_size = desc->sig_size; int err; desc->sig_size = 0; - err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement); + err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest); desc->sig_size = sig_size; return err; @@ -199,15 +199,15 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); - err = compute_file_measurement(vi->tree_params.hash_alg, desc, - vi->measurement); + err = compute_file_digest(vi->tree_params.hash_alg, desc, + vi->file_digest); if (err) { - fsverity_err(inode, "Error %d computing file measurement", err); + fsverity_err(inode, "Error %d computing file digest", err); goto out; } - pr_debug("Computed file measurement: %s:%*phN\n", + pr_debug("Computed file digest: %s:%*phN\n", vi->tree_params.hash_alg->name, - vi->tree_params.digest_size, vi->measurement); + vi->tree_params.digest_size, vi->file_digest); err = fsverity_verify_signature(vi, desc, desc_size); out: @@ -354,7 +354,7 @@ int __init fsverity_init_info_cache(void) { fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info, SLAB_RECLAIM_ACCOUNT, - measurement); + file_digest); if (!fsverity_info_cachep) return -ENOMEM; return 0; diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 4ac9dff281b5..a8591fbe6fe5 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -32,8 +32,8 @@ static struct key *fsverity_keyring; * @desc: the file's fsverity_descriptor * @desc_size: size of @desc * - * If the file's fs-verity descriptor includes a signature of the file - * measurement, verify it against the certificates in the fs-verity keyring. + * If the file's fs-verity descriptor includes a signature of the file digest, + * verify it against the certificates in the fs-verity keyring. * * Return: 0 on success (signature valid or not required); -errno on failure */ @@ -67,7 +67,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi, memcpy(d->magic, "FSVerity", 8); d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs); d->digest_size = cpu_to_le16(hash_alg->digest_size); - memcpy(d->digest, vi->measurement, hash_alg->digest_size); + memcpy(d->digest, vi->file_digest, hash_alg->digest_size); err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, desc->signature, sig_size, @@ -90,8 +90,8 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return err; } - pr_debug("Valid signature for file measurement %s:%*phN\n", - hash_alg->name, hash_alg->digest_size, vi->measurement); + pr_debug("Valid signature for file digest %s:%*phN\n", + hash_alg->name, hash_alg->digest_size, vi->file_digest); return 0; } From 8a6f3470939e8f3e2e0aa42bdb1e913883f6024f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Nov 2020 13:19:18 -0800 Subject: [PATCH 178/580] fs-verity: move structs needed for file signing to UAPI header Although it isn't used directly by the ioctls, "struct fsverity_descriptor" is required by userspace programs that need to compute fs-verity file digests in a standalone way. Therefore it's also needed to sign files in a standalone way. 
Similarly, "struct fsverity_formatted_digest" (previously called "struct fsverity_signed_digest" which was misleading) is also needed to sign files if the built-in signature verification is being used. Therefore, move these structs to the UAPI header. While doing this, try to make it clear that the signature-related fields in fsverity_descriptor aren't used in the file digest computation. Acked-by: Luca Boccassi Link: https://lore.kernel.org/r/20201113211918.71883-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 6 +--- fs/verity/fsverity_private.h | 37 ------------------- include/uapi/linux/fsverity.h | 49 ++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 42 deletions(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index 7c23f0b58ee3..28ec7ba5b6df 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -334,17 +334,13 @@ root hash as well as other fields such as the file size:: __u8 hash_algorithm; /* Merkle tree hash algorithm */ __u8 log_blocksize; /* log2 of size of data and tree blocks */ __u8 salt_size; /* size of salt in bytes; 0 if none */ - __le32 sig_size; /* must be 0 */ + __le32 __reserved_0x04; /* must be 0 */ __le64 data_size; /* size of file the Merkle tree is built over */ __u8 root_hash[64]; /* Merkle tree root hash */ __u8 salt[32]; /* salt prepended to each hashed block */ __u8 __reserved[144]; /* must be 0's */ }; -Note that the ``sig_size`` field must be set to 0 for the purpose of -computing the file measurement, even if a signature was provided (or -will be provided) to `FS_IOC_ENABLE_VERITY`_. - Built-in signature verification =============================== diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 21e9930d65fb..96f7b332f54f 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -77,49 +77,12 @@ struct fsverity_info { const struct inode *inode; }; -/* - * Merkle tree properties. The fs-verity file digest is the hash of this - * structure excluding the signature and with the sig_size field set to 0. - */ -struct fsverity_descriptor { - __u8 version; /* must be 1 */ - __u8 hash_algorithm; /* Merkle tree hash algorithm */ - __u8 log_blocksize; /* log2 of size of data and tree blocks */ - __u8 salt_size; /* size of salt in bytes; 0 if none */ - __le32 sig_size; /* size of signature in bytes; 0 if none */ - __le64 data_size; /* size of file the Merkle tree is built over */ - __u8 root_hash[64]; /* Merkle tree root hash */ - __u8 salt[32]; /* salt prepended to each hashed block */ - __u8 __reserved[144]; /* must be 0's */ - __u8 signature[]; /* optional PKCS#7 signature */ -}; - /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 #define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ sizeof(struct fsverity_descriptor)) -/* - * Format in which fs-verity file digests are signed in built-in signatures. - * This is the same as 'struct fsverity_digest', except here some magic bytes - * are prepended to provide some context about what is being signed in case the - * same key is used for non-fsverity purposes, and here the fields have fixed - * endianness. - * - * This struct is specific to the built-in signature verification support, which - * is optional. fs-verity users may also verify signatures in userspace, in - * which case userspace is responsible for deciding on what bytes are signed. 
- * This struct may still be used, but it doesn't have to be. For example, - * userspace could instead use a string like "sha256:$digest_as_hex_string". - */ -struct fsverity_formatted_digest { - char magic[8]; /* must be "FSVerity" */ - __le16 digest_algorithm; - __le16 digest_size; - __u8 digest[]; -}; - /* hash_algs.c */ extern struct fsverity_hash_alg fsverity_hash_algs[]; diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index da0daf6c193b..33f44156f8ea 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -34,6 +34,55 @@ struct fsverity_digest { __u8 digest[]; }; +/* + * Struct containing a file's Merkle tree properties. The fs-verity file digest + * is the hash of this struct. A userspace program needs this struct only if it + * needs to compute fs-verity file digests itself, e.g. in order to sign files. + * It isn't needed just to enable fs-verity on a file. + * + * Note: when computing the file digest, 'sig_size' and 'signature' must be left + * zero and empty, respectively. These fields are present only because some + * filesystems reuse this struct as part of their on-disk format. + */ +struct fsverity_descriptor { + __u8 version; /* must be 1 */ + __u8 hash_algorithm; /* Merkle tree hash algorithm */ + __u8 log_blocksize; /* log2 of size of data and tree blocks */ + __u8 salt_size; /* size of salt in bytes; 0 if none */ +#ifdef __KERNEL__ + __le32 sig_size; +#else + __le32 __reserved_0x04; /* must be 0 */ +#endif + __le64 data_size; /* size of file the Merkle tree is built over */ + __u8 root_hash[64]; /* Merkle tree root hash */ + __u8 salt[32]; /* salt prepended to each hashed block */ + __u8 __reserved[144]; /* must be 0's */ +#ifdef __KERNEL__ + __u8 signature[]; +#endif +}; + +/* + * Format in which fs-verity file digests are signed in built-in signatures. + * This is the same as 'struct fsverity_digest', except here some magic bytes + * are prepended to provide some context about what is being signed in case the + * same key is used for non-fsverity purposes, and here the fields have fixed + * endianness. + * + * This struct is specific to the built-in signature verification support, which + * is optional. fs-verity users may also verify signatures in userspace, in + * which case userspace is responsible for deciding on what bytes are signed. + * This struct may still be used, but it doesn't have to be. For example, + * userspace could instead use a string like "sha256:$digest_as_hex_string". + */ +struct fsverity_formatted_digest { + char magic[8]; /* must be "FSVerity" */ + __le16 digest_algorithm; + __le16 digest_size; + __u8 digest[]; +}; + #define FS_IOC_ENABLE_VERITY _IOW('f', 133, struct fsverity_enable_arg) #define FS_IOC_MEASURE_VERITY _IOWR('f', 134, struct fsverity_digest) From 4a7bdcc72022babd33abf2f15cfcc1d57f3a73f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:21:57 +0100 Subject: [PATCH 179/580] f2fs: remove a few bd_part checks bd_part is never NULL for a block device in use by a file system, so remove the checks. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Chao Yu Signed-off-by: Jens Axboe --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/sysfs.c | 9 --------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c62be7bf819c..c7fc8a33616f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1387,8 +1387,7 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, static inline u64 get_sectors_written(struct block_device *bdev) { - return bdev->bd_part ? - (u64)part_stat_read(bdev->bd_part, sectors[STAT_WRITE]) : 0; + return (u64)part_stat_read(bdev->bd_part, sectors[STAT_WRITE]); } u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d36a5a86b92c..dc2b102f3482 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -94,11 +94,6 @@ static ssize_t free_segments_show(struct f2fs_attr *a, static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - return sprintf(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + ((f2fs_get_sectors_written(sbi) - @@ -114,12 +109,8 @@ static ssize_t sb_status_show(struct f2fs_attr *a, static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; int len = 0; - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - if (f2fs_sb_has_encrypt(sbi)) len += scnprintf(buf, PAGE_SIZE - len, "%s", "encryption"); From 206329e22bb90eb383341ed44897012cc478a4da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:37 +0100 Subject: [PATCH 180/580] f2fs: use blkdev_issue_flush in __submit_flush_wait Use the blkdev_issue_flush helper instead of duplicating it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- fs/f2fs/data.c | 3 ++- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 12 +----------- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 21eb4296a56d..d638ee8dd225 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -53,7 +53,8 @@ static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); } -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio) +static struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, + bool noio) { if (noio) { /* No failure on bio allocation */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c51cccc9d21b..24dfca37c802 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3491,7 +3491,6 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c3988bd6eb80..c6ea8bed9502 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -563,17 +563,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio; - int ret; - - bio = f2fs_bio_alloc(sbi, 0, false); - if (!bio) - return -ENOMEM; - - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio_set_dev(bio, bdev); - ret = submit_bio_wait(bio); - bio_put(bio); + int ret = blkdev_issue_flush(bdev, GFP_NOFS, NULL); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); From aa977bd3d85086a3b2f2a26c802df21006f25693 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:38 +0100 Subject: [PATCH 181/580] f2fs: remove FAULT_ALLOC_BIO Sleeping bio allocations do not fail, which means that injecting an error into sleeping bio allocations is a little silly. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- Documentation/filesystems/f2fs.rst | 1 - fs/f2fs/data.c | 29 ++++------------------------- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 1 - 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 81c05baa8312..35ed01a5fbc9 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -179,7 +179,6 @@ fault_type=%d Support configuring fault injection type, should be FAULT_KVMALLOC 0x000000002 FAULT_PAGE_ALLOC 0x000000004 FAULT_PAGE_GET 0x000000008 - FAULT_ALLOC_BIO 0x000000010 FAULT_ALLOC_NID 0x000000020 FAULT_ORPHAN 0x000000040 FAULT_BLOCK 0x000000080 diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d638ee8dd225..e1352ec0f210 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -47,28 +47,6 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, - unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); -} - -static struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, - bool noio) -{ - if (noio) { - /* No failure on bio allocation */ - return __f2fs_bio_alloc(GFP_NOIO, npages); - } - - if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); - return NULL; - } - - return __f2fs_bio_alloc(GFP_KERNEL, npages); -} - static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -412,7 +390,7 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) struct f2fs_sb_info *sbi = fio->sbi; struct bio *bio; - bio = f2fs_bio_alloc(sbi, npages, true); + bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); f2fs_target_device(sbi, fio->new_blkaddr, bio); if (is_read_io(fio->op)) { @@ -954,8 +932,9 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), - for_write); + bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, + min_t(int, nr_pages, BIO_MAX_PAGES), + &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); f2fs_target_device(sbi, blkaddr, bio); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 24dfca37c802..392bfd8acd34 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -42,7 +42,6 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, - FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3cf23968e8a0..f43fad1cb666 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -45,7 +45,6 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", - [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", From 9112146082565dc7e9e9f6e2ef44bf45bea13c48 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:14 -0800 Subject: [PATCH 182/580] fs-verity: factor out fsverity_get_descriptor() The FS_IOC_READ_VERITY_METADATA ioctl will need to return the fs-verity descriptor (and signature) to userspace. There are a few ways we could implement this: - Save a copy of the descriptor (and signature) in the fsverity_info struct that hangs off of the in-memory inode. 
However, this would waste memory since most of the time it wouldn't be needed. - Regenerate the descriptor from the merkle_tree_params in the fsverity_info. However, this wouldn't work for the signature, nor for the salt which the merkle_tree_params only contains indirectly as part of the 'hashstate'. It would also be error-prone. - Just get them from the filesystem again. The disadvantage is that in general we can't trust that they haven't been maliciously changed since the file has opened. However, the use cases for FS_IOC_READ_VERITY_METADATA don't require that it verifies the chain of trust. So this is okay as long as we do some basic validation. In preparation for implementing the third option, factor out a helper function fsverity_get_descriptor() which gets the descriptor (and appended signature) from the filesystem and does some basic validation. As part of this, start checking the sig_size field for overflow. Currently fsverity_verify_signature() does this. But the new ioctl will need this too, so do it earlier. Link: https://lore.kernel.org/r/20210115181819.34732-2-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- fs/verity/fsverity_private.h | 7 +- fs/verity/open.c | 130 +++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 46 deletions(-) diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 96f7b332f54f..a647b761750a 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -122,12 +122,17 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, const u8 *salt, size_t salt_size); struct fsverity_info *fsverity_create_info(const struct inode *inode, - void *desc, size_t desc_size); + struct fsverity_descriptor *desc, + size_t desc_size); void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); void fsverity_free_info(struct fsverity_info *vi); +int fsverity_get_descriptor(struct inode *inode, + struct fsverity_descriptor **desc_ret, + size_t *desc_size_ret); + int __init fsverity_init_info_cache(void); void __init fsverity_exit_info_cache(void); diff --git a/fs/verity/open.c b/fs/verity/open.c index 228d0eca3e2e..a987bb785e9b 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -142,45 +142,17 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, } /* - * Validate the given fsverity_descriptor and create a new fsverity_info from - * it. The signature (if present) is also checked. + * Create a new fsverity_info from the given fsverity_descriptor (with optional + * appended signature), and check the signature if present. The + * fsverity_descriptor must have already undergone basic validation. 
*/ struct fsverity_info *fsverity_create_info(const struct inode *inode, - void *_desc, size_t desc_size) + struct fsverity_descriptor *desc, + size_t desc_size) { - struct fsverity_descriptor *desc = _desc; struct fsverity_info *vi; int err; - if (desc_size < sizeof(*desc)) { - fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", - desc_size); - return ERR_PTR(-EINVAL); - } - - if (desc->version != 1) { - fsverity_err(inode, "Unrecognized descriptor version: %u", - desc->version); - return ERR_PTR(-EINVAL); - } - - if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { - fsverity_err(inode, "Reserved bits set in descriptor"); - return ERR_PTR(-EINVAL); - } - - if (desc->salt_size > sizeof(desc->salt)) { - fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); - return ERR_PTR(-EINVAL); - } - - if (le64_to_cpu(desc->data_size) != inode->i_size) { - fsverity_err(inode, - "Wrong data_size: %llu (desc) != %lld (inode)", - le64_to_cpu(desc->data_size), inode->i_size); - return ERR_PTR(-EINVAL); - } - vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL); if (!vi) return ERR_PTR(-ENOMEM); @@ -245,15 +217,57 @@ void fsverity_free_info(struct fsverity_info *vi) kmem_cache_free(fsverity_info_cachep, vi); } -/* Ensure the inode has an ->i_verity_info */ -static int ensure_verity_info(struct inode *inode) +static bool validate_fsverity_descriptor(struct inode *inode, + const struct fsverity_descriptor *desc, + size_t desc_size) { - struct fsverity_info *vi = fsverity_get_info(inode); - struct fsverity_descriptor *desc; - int res; + if (desc_size < sizeof(*desc)) { + fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", + desc_size); + return false; + } - if (vi) - return 0; + if (desc->version != 1) { + fsverity_err(inode, "Unrecognized descriptor version: %u", + desc->version); + return false; + } + + if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { + fsverity_err(inode, "Reserved bits set in descriptor"); + return false; + } + + if (desc->salt_size > sizeof(desc->salt)) { + fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); + return false; + } + + if (le64_to_cpu(desc->data_size) != inode->i_size) { + fsverity_err(inode, + "Wrong data_size: %llu (desc) != %lld (inode)", + le64_to_cpu(desc->data_size), inode->i_size); + return false; + } + + if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) { + fsverity_err(inode, "Signature overflows verity descriptor"); + return false; + } + + return true; +} + +/* + * Read the inode's fsverity_descriptor (with optional appended signature) from + * the filesystem, and do basic validation of it. 
+ */ +int fsverity_get_descriptor(struct inode *inode, + struct fsverity_descriptor **desc_ret, + size_t *desc_size_ret) +{ + int res; + struct fsverity_descriptor *desc; res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0); if (res < 0) { @@ -272,20 +286,46 @@ static int ensure_verity_info(struct inode *inode) res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res); if (res < 0) { fsverity_err(inode, "Error %d reading verity descriptor", res); - goto out_free_desc; + kfree(desc); + return res; } - vi = fsverity_create_info(inode, desc, res); + if (!validate_fsverity_descriptor(inode, desc, res)) { + kfree(desc); + return -EINVAL; + } + + *desc_ret = desc; + *desc_size_ret = res; + return 0; +} + +/* Ensure the inode has an ->i_verity_info */ +static int ensure_verity_info(struct inode *inode) +{ + struct fsverity_info *vi = fsverity_get_info(inode); + struct fsverity_descriptor *desc; + size_t desc_size; + int err; + + if (vi) + return 0; + + err = fsverity_get_descriptor(inode, &desc, &desc_size); + if (err) + return err; + + vi = fsverity_create_info(inode, desc, desc_size); if (IS_ERR(vi)) { - res = PTR_ERR(vi); + err = PTR_ERR(vi); goto out_free_desc; } fsverity_set_info(inode, vi); - res = 0; + err = 0; out_free_desc: kfree(desc); - return res; + return err; } /** From b8dc76de9074256463e4c5f64e8da5e68d174366 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:15 -0800 Subject: [PATCH 183/580] fs-verity: don't pass whole descriptor to fsverity_verify_signature() Now that fsverity_get_descriptor() validates the sig_size field, fsverity_verify_signature() doesn't need to do it. Just change the prototype of fsverity_verify_signature() to take the signature directly rather than take a fsverity_descriptor. 
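For reference, the bound this relies on can be pictured as follows (an illustration added for clarity, not a hunk from the patch). The blob returned by ->get_verity_descriptor() is the fixed-size descriptor struct followed immediately by the signature bytes, so the two checks in fsverity_get_descriptor() together keep desc->signature inside the blob:

    /*
     * Layout of the descriptor blob read from the filesystem
     * (illustration only):
     *
     *   [ struct fsverity_descriptor ][ signature bytes ]
     *   |<------ sizeof(*desc) ------>|<-- sig_size ---->|
     *   |<------------------ desc_size ----------------->|
     *
     * fsverity_get_descriptor() accepts the blob only if
     *   desc_size >= sizeof(*desc)  and
     *   le32_to_cpu(desc->sig_size) <= desc_size - sizeof(*desc),
     * so fsverity_verify_signature() can use the (signature,
     * sig_size) pair without re-checking the bounds itself.
     */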
Link: https://lore.kernel.org/r/20210115181819.34732-3-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Amy Parker Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- fs/verity/fsverity_private.h | 6 ++---- fs/verity/open.c | 3 ++- fs/verity/signature.c | 20 ++++++-------------- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index a647b761750a..a275cad1548a 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -140,15 +140,13 @@ void __init fsverity_exit_info_cache(void); #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size); + const u8 *signature, size_t sig_size); int __init fsverity_init_signature(void); #else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ static inline int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size) + const u8 *signature, size_t sig_size) { return 0; } diff --git a/fs/verity/open.c b/fs/verity/open.c index a987bb785e9b..60ff8af7219f 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -181,7 +181,8 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, vi->tree_params.hash_alg->name, vi->tree_params.digest_size, vi->file_digest); - err = fsverity_verify_signature(vi, desc, desc_size); + err = fsverity_verify_signature(vi, desc->signature, + le32_to_cpu(desc->sig_size)); out: if (err) { fsverity_free_info(vi); diff --git a/fs/verity/signature.c b/fs/verity/signature.c index a8591fbe6fe5..b42f6e832825 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -29,21 +29,19 @@ static struct key *fsverity_keyring; /** * fsverity_verify_signature() - check a verity file's signature * @vi: the file's fsverity_info - * @desc: the file's fsverity_descriptor - * @desc_size: size of @desc + * @signature: the file's built-in signature + * @sig_size: size of signature in bytes, or 0 if no signature * - * If the file's fs-verity descriptor includes a signature of the file digest, - * verify it against the certificates in the fs-verity keyring. + * If the file includes a signature of its fs-verity file digest, verify it + * against the certificates in the fs-verity keyring. 
* * Return: 0 on success (signature valid or not required); -errno on failure */ int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size) + const u8 *signature, size_t sig_size) { const struct inode *inode = vi->inode; const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; - const u32 sig_size = le32_to_cpu(desc->sig_size); struct fsverity_formatted_digest *d; int err; @@ -56,11 +54,6 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return 0; } - if (sig_size > desc_size - sizeof(*desc)) { - fsverity_err(inode, "Signature overflows verity descriptor"); - return -EBADMSG; - } - d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL); if (!d) return -ENOMEM; @@ -70,8 +63,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi, memcpy(d->digest, vi->file_digest, hash_alg->digest_size); err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, - desc->signature, sig_size, - fsverity_keyring, + signature, sig_size, fsverity_keyring, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); kfree(d); From aad86a03651462022be1d8a1589f538af149aac6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:16 -0800 Subject: [PATCH 184/580] fs-verity: add FS_IOC_READ_VERITY_METADATA ioctl Add an ioctl FS_IOC_READ_VERITY_METADATA which will allow reading verity metadata from a file that has fs-verity enabled, including: - The Merkle tree - The fsverity_descriptor (not including the signature if present) - The built-in signature, if present This ioctl has similar semantics to pread(). It is passed the type of metadata to read (one of the above three), and a buffer, offset, and size. It returns the number of bytes read or an error. Separate patches will add support for each of the above metadata types. This patch just adds the ioctl itself. This ioctl doesn't make any assumption about where the metadata is stored on-disk. It does assume the metadata is in a stable format, but that's basically already the case: - The Merkle tree and fsverity_descriptor are defined by how fs-verity file digests are computed; see the "File digest computation" section of Documentation/filesystems/fsverity.rst. Technically, the way in which the levels of the tree are ordered relative to each other wasn't previously specified, but it's logical to put the root level first. - The built-in signature is the value passed to FS_IOC_ENABLE_VERITY. This ioctl is useful because it allows writing a server program that takes a verity file and serves it to a client program, such that the client can do its own fs-verity compatible verification of the file. This only makes sense if the client doesn't trust the server and if the server needs to provide the storage for the client. More concretely, there is interest in using this ability in Android to export APK files (which are protected by fs-verity) to "protected VMs". This would use Protected KVM (https://lwn.net/Articles/836693), which provides an isolated execution environment without having to trust the traditional "host". A "guest" VM can boot from a signed image and perform specific tasks in a minimum trusted environment using files that have fs-verity enabled on the host, without trusting the host or requiring that the guest has its own trusted storage. Technically, it would be possible to duplicate the metadata and store it in separate files for serving. 
However, that would be less efficient and would require extra care in userspace to maintain file consistency. In addition to the above, the ability to read the built-in signatures is useful because it allows a system that is using the in-kernel signature verification to migrate to userspace signature verification. Link: https://lore.kernel.org/r/20210115181819.34732-4-ebiggers@kernel.org Reviewed-by: Victor Hsieh Acked-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 57 ++++++++++++++++++++++++++ fs/ext4/ioctl.c | 7 ++++ fs/f2fs/file.c | 11 +++++ fs/verity/Makefile | 1 + fs/verity/read_metadata.c | 55 +++++++++++++++++++++++++ include/linux/fsverity.h | 12 ++++++ include/uapi/linux/fsverity.h | 10 +++++ 7 files changed, 153 insertions(+) create mode 100644 fs/verity/read_metadata.c diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index 28ec7ba5b6df..a9a7663ec980 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -217,6 +217,63 @@ FS_IOC_MEASURE_VERITY can fail with the following errors: - ``EOVERFLOW``: the digest is longer than the specified ``digest_size`` bytes. Try providing a larger buffer. +FS_IOC_READ_VERITY_METADATA +--------------------------- + +The FS_IOC_READ_VERITY_METADATA ioctl reads verity metadata from a +verity file. This ioctl is available since Linux v5.12. + +This ioctl allows writing a server program that takes a verity file +and serves it to a client program, such that the client can do its own +fs-verity compatible verification of the file. This only makes sense +if the client doesn't trust the server and if the server needs to +provide the storage for the client. + +This is a fairly specialized use case, and most fs-verity users won't +need this ioctl. + +This ioctl takes in a pointer to the following structure:: + + struct fsverity_read_metadata_arg { + __u64 metadata_type; + __u64 offset; + __u64 length; + __u64 buf_ptr; + __u64 __reserved; + }; + +``metadata_type`` specifies the type of metadata to read. + +The semantics are similar to those of ``pread()``. ``offset`` +specifies the offset in bytes into the metadata item to read from, and +``length`` specifies the maximum number of bytes to read from the +metadata item. ``buf_ptr`` is the pointer to the buffer to read into, +cast to a 64-bit integer. ``__reserved`` must be 0. On success, the +number of bytes read is returned. 0 is returned at the end of the +metadata item. The returned length may be less than ``length``, for +example if the ioctl is interrupted. + +The metadata returned by FS_IOC_READ_VERITY_METADATA isn't guaranteed +to be authenticated against the file digest that would be returned by +`FS_IOC_MEASURE_VERITY`_, as the metadata is expected to be used to +implement fs-verity compatible verification anyway (though absent a +malicious disk, the metadata will indeed match). E.g. to implement +this ioctl, the filesystem is allowed to just read the Merkle tree +blocks from disk without actually verifying the path to the root node. 
+ +FS_IOC_READ_VERITY_METADATA can fail with the following errors: + +- ``EFAULT``: the caller provided inaccessible memory +- ``EINTR``: the ioctl was interrupted before any data was read +- ``EINVAL``: reserved fields were set, or ``offset + length`` + overflowed +- ``ENODATA``: the file is not a verity file +- ``ENOTTY``: this type of filesystem does not implement fs-verity, or + this ioctl is not yet implemented on it +- ``EOPNOTSUPP``: the kernel was not configured with fs-verity + support, or the filesystem superblock has not had the 'verity' + feature enabled on it. (See `Filesystem support`_.) + FS_IOC_GETFLAGS --------------- diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5214b4a93be9..1cad30aeb51b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1102,6 +1102,12 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EOPNOTSUPP; return fsverity_ioctl_measure(filp, (void __user *)arg); + case FS_IOC_READ_VERITY_METADATA: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_read_metadata(filp, + (const void __user *)arg); + default: return -ENOTTY; } @@ -1172,6 +1178,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GETFSMAP: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7726519d6168..ae9bc814fc11 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3400,6 +3400,14 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) return fsverity_ioctl_measure(filp, (void __user *)arg); } +static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); +} + static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4314,6 +4322,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_enable_verity(filp, arg); case FS_IOC_MEASURE_VERITY: return f2fs_ioc_measure_verity(filp, arg); + case FS_IOC_READ_VERITY_METADATA: + return f2fs_ioc_read_verity_metadata(filp, arg); case FS_IOC_GETFSLABEL: return f2fs_ioc_getfslabel(filp, arg); case FS_IOC_SETFSLABEL: @@ -4570,6 +4580,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RESIZE_FS: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: case FS_IOC_GETFSLABEL: case FS_IOC_SETFSLABEL: case F2FS_IOC_GET_COMPRESS_BLOCKS: diff --git a/fs/verity/Makefile b/fs/verity/Makefile index 570e9136334d..435559a4fa9e 100644 --- a/fs/verity/Makefile +++ b/fs/verity/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_FS_VERITY) += enable.o \ init.o \ measure.o \ open.o \ + read_metadata.o \ verify.o obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c new file mode 100644 index 000000000000..43be990fd53e --- /dev/null +++ b/fs/verity/read_metadata.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ioctl to read verity metadata + * + * Copyright 2021 Google LLC + */ + +#include "fsverity_private.h" + +#include + +/** + * fsverity_ioctl_read_metadata() - read verity metadata from a file + * @filp: file to read the metadata from + * @uarg: user pointer to fsverity_read_metadata_arg + * 
+ * Return: length read on success, 0 on EOF, -errno on failure + */ +int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) +{ + struct inode *inode = file_inode(filp); + const struct fsverity_info *vi; + struct fsverity_read_metadata_arg arg; + int length; + void __user *buf; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + /* + * Note that we don't have to explicitly check that the file is open for + * reading, since verity files can only be opened for reading. + */ + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (arg.__reserved) + return -EINVAL; + + /* offset + length must not overflow. */ + if (arg.offset + arg.length < arg.offset) + return -EINVAL; + + /* Ensure that the return value will fit in INT_MAX. */ + length = min_t(u64, arg.length, INT_MAX); + + buf = u64_to_user_ptr(arg.buf_ptr); + + switch (arg.metadata_type) { + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata); diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index c1144a450392..b568b3c7d095 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -138,6 +138,10 @@ int fsverity_file_open(struct inode *inode, struct file *filp); int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); void fsverity_cleanup_inode(struct inode *inode); +/* read_metadata.c */ + +int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); + /* verify.c */ bool fsverity_verify_page(struct page *page); @@ -183,6 +187,14 @@ static inline void fsverity_cleanup_inode(struct inode *inode) { } +/* read_metadata.c */ + +static inline int fsverity_ioctl_read_metadata(struct file *filp, + const void __user *uarg) +{ + return -EOPNOTSUPP; +} + /* verify.c */ static inline bool fsverity_verify_page(struct page *page) diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index 33f44156f8ea..e062751294d0 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -83,7 +83,17 @@ struct fsverity_formatted_digest { __u8 digest[]; }; +struct fsverity_read_metadata_arg { + __u64 metadata_type; + __u64 offset; + __u64 length; + __u64 buf_ptr; + __u64 __reserved; +}; + #define FS_IOC_ENABLE_VERITY _IOW('f', 133, struct fsverity_enable_arg) #define FS_IOC_MEASURE_VERITY _IOWR('f', 134, struct fsverity_digest) +#define FS_IOC_READ_VERITY_METADATA \ + _IOWR('f', 135, struct fsverity_read_metadata_arg) #endif /* _UAPI_LINUX_FSVERITY_H */ From 11fa0d75de393ee16a1d2d3e6fd63eed8d015373 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:17 -0800 Subject: [PATCH 185/580] fs-verity: support reading Merkle tree with ioctl Add support for FS_VERITY_METADATA_TYPE_MERKLE_TREE to FS_IOC_READ_VERITY_METADATA. This allows a userspace server program to retrieve the Merkle tree of a verity file for serving to a client which implements fs-verity compatible verification. See the patch which introduced FS_IOC_READ_VERITY_METADATA for more details. This has been tested using a new xfstest which calls this ioctl via a new subcommand for the 'fsverity' program from fsverity-utils. 
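To make the intended userspace flow concrete, here is a minimal sketch (not part of the patch; the helper name, buffer size, and error handling are illustrative assumptions) of a server program reading out the Merkle tree with the new ioctl. It follows the documented pread()-like semantics: read repeatedly, advancing the offset, until 0 is returned.

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fsverity.h>

    /* Sketch: copy the whole Merkle tree of the verity file 'fd' to 'out'. */
    static int dump_merkle_tree(int fd, FILE *out)
    {
            struct fsverity_read_metadata_arg arg;
            char buf[4096];
            uint64_t offset = 0;
            int n;

            for (;;) {
                    arg.metadata_type = FS_VERITY_METADATA_TYPE_MERKLE_TREE;
                    arg.offset = offset;
                    arg.length = sizeof(buf);
                    arg.buf_ptr = (uintptr_t)buf;
                    arg.__reserved = 0;

                    n = ioctl(fd, FS_IOC_READ_VERITY_METADATA, &arg);
                    if (n < 0)
                            return -1;      /* errno describes the failure */
                    if (n == 0)
                            return 0;       /* end of the Merkle tree */
                    if (fwrite(buf, 1, n, out) != (size_t)n)
                            return -1;
                    offset += n;            /* short reads are possible */
            }
    }

The same loop shape works for the other metadata types added later in this series; only metadata_type changes.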
Link: https://lore.kernel.org/r/20210115181819.34732-5-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 10 +++- fs/verity/read_metadata.c | 70 ++++++++++++++++++++++++++ include/uapi/linux/fsverity.h | 2 + 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index a9a7663ec980..e698a4383eb0 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -234,6 +234,8 @@ need this ioctl. This ioctl takes in a pointer to the following structure:: + #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 + struct fsverity_read_metadata_arg { __u64 metadata_type; __u64 offset; @@ -242,7 +244,13 @@ This ioctl takes in a pointer to the following structure:: __u64 __reserved; }; -``metadata_type`` specifies the type of metadata to read. +``metadata_type`` specifies the type of metadata to read: + +- ``FS_VERITY_METADATA_TYPE_MERKLE_TREE`` reads the blocks of the + Merkle tree. The blocks are returned in order from the root level + to the leaf level. Within each level, the blocks are returned in + the same order that their hashes are themselves hashed. + See `Merkle tree`_ for more information. The semantics are similar to those of ``pread()``. ``offset`` specifies the offset in bytes into the metadata item to read from, and diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 43be990fd53e..0f8ad2991cf9 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -7,8 +7,75 @@ #include "fsverity_private.h" +#include +#include +#include #include +static int fsverity_read_merkle_tree(struct inode *inode, + const struct fsverity_info *vi, + void __user *buf, u64 offset, int length) +{ + const struct fsverity_operations *vops = inode->i_sb->s_vop; + u64 end_offset; + unsigned int offs_in_page; + pgoff_t index, last_index; + int retval = 0; + int err = 0; + + end_offset = min(offset + length, vi->tree_params.tree_size); + if (offset >= end_offset) + return 0; + offs_in_page = offset_in_page(offset); + last_index = (end_offset - 1) >> PAGE_SHIFT; + + /* + * Iterate through each Merkle tree page in the requested range and copy + * the requested portion to userspace. Note that the Merkle tree block + * size isn't important here, as we are returning a byte stream; i.e., + * we can just work with pages even if the tree block size != PAGE_SIZE. + */ + for (index = offset >> PAGE_SHIFT; index <= last_index; index++) { + unsigned long num_ra_pages = + min_t(unsigned long, last_index - index + 1, + inode->i_sb->s_bdi->io_pages); + unsigned int bytes_to_copy = min_t(u64, end_offset - offset, + PAGE_SIZE - offs_in_page); + struct page *page; + const void *virt; + + page = vops->read_merkle_tree_page(inode, index, num_ra_pages); + if (IS_ERR(page)) { + err = PTR_ERR(page); + fsverity_err(inode, + "Error %d reading Merkle tree page %lu", + err, index); + break; + } + + virt = kmap(page); + if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) { + kunmap(page); + put_page(page); + err = -EFAULT; + break; + } + kunmap(page); + put_page(page); + + retval += bytes_to_copy; + buf += bytes_to_copy; + offset += bytes_to_copy; + + if (fatal_signal_pending(current)) { + err = -EINTR; + break; + } + cond_resched(); + offs_in_page = 0; + } + return retval ? 
retval : err; +} /** * fsverity_ioctl_read_metadata() - read verity metadata from a file * @filp: file to read the metadata from @@ -48,6 +115,9 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) buf = u64_to_user_ptr(arg.buf_ptr); switch (arg.metadata_type) { + case FS_VERITY_METADATA_TYPE_MERKLE_TREE: + return fsverity_read_merkle_tree(inode, vi, buf, arg.offset, + length); default: return -EINVAL; } diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index e062751294d0..94003b153cb3 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -83,6 +83,8 @@ struct fsverity_formatted_digest { __u8 digest[]; }; +#define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 + struct fsverity_read_metadata_arg { __u64 metadata_type; __u64 offset; From 67c2d4aa2ed77afb08a38e2d4bd530ec1168bd6a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:18 -0800 Subject: [PATCH 186/580] fs-verity: support reading descriptor with ioctl Add support for FS_VERITY_METADATA_TYPE_DESCRIPTOR to FS_IOC_READ_VERITY_METADATA. This allows a userspace server program to retrieve the fs-verity descriptor of a file for serving to a client which implements fs-verity compatible verification. See the patch which introduced FS_IOC_READ_VERITY_METADATA for more details. "fs-verity descriptor" here means only the part that userspace cares about because it is hashed to produce the file digest. It doesn't include the signature which ext4 and f2fs append to the fsverity_descriptor struct when storing it on-disk, since that way of storing the signature is an implementation detail. The next patch adds a separate metadata_type value for retrieving the signature separately. This has been tested using a new xfstest which calls this ioctl via a new subcommand for the 'fsverity' program from fsverity-utils. Link: https://lore.kernel.org/r/20210115181819.34732-6-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 4 +++ fs/verity/read_metadata.c | 40 ++++++++++++++++++++++++++ include/uapi/linux/fsverity.h | 1 + 3 files changed, 45 insertions(+) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index e698a4383eb0..633166c84c80 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -235,6 +235,7 @@ need this ioctl. This ioctl takes in a pointer to the following structure:: #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 + #define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 struct fsverity_read_metadata_arg { __u64 metadata_type; @@ -252,6 +253,9 @@ This ioctl takes in a pointer to the following structure:: the same order that their hashes are themselves hashed. See `Merkle tree`_ for more information. +- ``FS_VERITY_METADATA_TYPE_DESCRIPTOR`` reads the fs-verity + descriptor. See `fs-verity descriptor`_. + The semantics are similar to those of ``pread()``. ``offset`` specifies the offset in bytes into the metadata item to read from, and ``length`` specifies the maximum number of bytes to read from the diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 0f8ad2991cf9..2dea6dd3bb05 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -76,6 +76,44 @@ static int fsverity_read_merkle_tree(struct inode *inode, } return retval ? retval : err; } + +/* Copy the requested portion of the buffer to userspace. 
*/ +static int fsverity_read_buffer(void __user *dst, u64 offset, int length, + const void *src, size_t src_length) +{ + if (offset >= src_length) + return 0; + src += offset; + src_length -= offset; + + length = min_t(size_t, length, src_length); + + if (copy_to_user(dst, src, length)) + return -EFAULT; + + return length; +} + +static int fsverity_read_descriptor(struct inode *inode, + void __user *buf, u64 offset, int length) +{ + struct fsverity_descriptor *desc; + size_t desc_size; + int res; + + res = fsverity_get_descriptor(inode, &desc, &desc_size); + if (res) + return res; + + /* don't include the signature */ + desc_size = offsetof(struct fsverity_descriptor, signature); + desc->sig_size = 0; + + res = fsverity_read_buffer(buf, offset, length, desc, desc_size); + + kfree(desc); + return res; +} /** * fsverity_ioctl_read_metadata() - read verity metadata from a file * @filp: file to read the metadata from @@ -118,6 +156,8 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) case FS_VERITY_METADATA_TYPE_MERKLE_TREE: return fsverity_read_merkle_tree(inode, vi, buf, arg.offset, length); + case FS_VERITY_METADATA_TYPE_DESCRIPTOR: + return fsverity_read_descriptor(inode, buf, arg.offset, length); default: return -EINVAL; } diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index 94003b153cb3..41abc283dbcc 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -84,6 +84,7 @@ struct fsverity_formatted_digest { }; #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 +#define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 struct fsverity_read_metadata_arg { __u64 metadata_type; From 5f4d2c15ba07a95486757b027347a87f7b95a206 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:19 -0800 Subject: [PATCH 187/580] fs-verity: support reading signature with ioctl Add support for FS_VERITY_METADATA_TYPE_SIGNATURE to FS_IOC_READ_VERITY_METADATA. This allows a userspace server program to retrieve the built-in signature (if present) of a verity file for serving to a client which implements fs-verity compatible verification. See the patch which introduced FS_IOC_READ_VERITY_METADATA for more details. The ability for userspace to read the built-in signatures is also useful because it allows a system that is using the in-kernel signature verification to migrate to userspace signature verification. This has been tested using a new xfstest which calls this ioctl via a new subcommand for the 'fsverity' program from fsverity-utils. 
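As a usage sketch (illustrative only; the single fixed-size read is a simplifying assumption, and a robust reader would loop on short reads as in the Merkle-tree sketch earlier in this series), a client can request the signature directly and use the documented ENODATA semantics to detect that none is present:

    #include <errno.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/fsverity.h>

    /*
     * Sketch: returns the number of signature bytes read, 0 if there is no
     * built-in signature to read, or -1 on other errors (errno is set).
     */
    static int read_builtin_signature(int fd, void *buf, unsigned int buflen)
    {
            struct fsverity_read_metadata_arg arg = {
                    .metadata_type = FS_VERITY_METADATA_TYPE_SIGNATURE,
                    .offset = 0,
                    .length = buflen,
                    .buf_ptr = (uintptr_t)buf,
                    .__reserved = 0,
            };
            int n = ioctl(fd, FS_IOC_READ_VERITY_METADATA, &arg);

            /* ENODATA: not a verity file, or no signature was provided. */
            if (n < 0 && errno == ENODATA)
                    return 0;
            return n;
    }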
Link: https://lore.kernel.org/r/20210115181819.34732-7-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 9 +++++++- fs/verity/read_metadata.c | 30 ++++++++++++++++++++++++++ include/uapi/linux/fsverity.h | 1 + 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index 633166c84c80..8e73ac0c6ae6 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -236,6 +236,7 @@ This ioctl takes in a pointer to the following structure:: #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 #define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 + #define FS_VERITY_METADATA_TYPE_SIGNATURE 3 struct fsverity_read_metadata_arg { __u64 metadata_type; @@ -256,6 +257,10 @@ This ioctl takes in a pointer to the following structure:: - ``FS_VERITY_METADATA_TYPE_DESCRIPTOR`` reads the fs-verity descriptor. See `fs-verity descriptor`_. +- ``FS_VERITY_METADATA_TYPE_SIGNATURE`` reads the signature which was + passed to FS_IOC_ENABLE_VERITY, if any. See `Built-in signature + verification`_. + The semantics are similar to those of ``pread()``. ``offset`` specifies the offset in bytes into the metadata item to read from, and ``length`` specifies the maximum number of bytes to read from the @@ -279,7 +284,9 @@ FS_IOC_READ_VERITY_METADATA can fail with the following errors: - ``EINTR``: the ioctl was interrupted before any data was read - ``EINVAL``: reserved fields were set, or ``offset + length`` overflowed -- ``ENODATA``: the file is not a verity file +- ``ENODATA``: the file is not a verity file, or + FS_VERITY_METADATA_TYPE_SIGNATURE was requested but the file doesn't + have a built-in signature - ``ENOTTY``: this type of filesystem does not implement fs-verity, or this ioctl is not yet implemented on it - ``EOPNOTSUPP``: the kernel was not configured with fs-verity diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 2dea6dd3bb05..7e2d0c7bdf0d 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -114,6 +114,34 @@ static int fsverity_read_descriptor(struct inode *inode, kfree(desc); return res; } + +static int fsverity_read_signature(struct inode *inode, + void __user *buf, u64 offset, int length) +{ + struct fsverity_descriptor *desc; + size_t desc_size; + int res; + + res = fsverity_get_descriptor(inode, &desc, &desc_size); + if (res) + return res; + + if (desc->sig_size == 0) { + res = -ENODATA; + goto out; + } + + /* + * Include only the signature. Note that fsverity_get_descriptor() + * already verified that sig_size is in-bounds. 
+ */ + res = fsverity_read_buffer(buf, offset, length, desc->signature, + le32_to_cpu(desc->sig_size)); +out: + kfree(desc); + return res; +} + /** * fsverity_ioctl_read_metadata() - read verity metadata from a file * @filp: file to read the metadata from @@ -158,6 +186,8 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) length); case FS_VERITY_METADATA_TYPE_DESCRIPTOR: return fsverity_read_descriptor(inode, buf, arg.offset, length); + case FS_VERITY_METADATA_TYPE_SIGNATURE: + return fsverity_read_signature(inode, buf, arg.offset, length); default: return -EINVAL; } diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index 41abc283dbcc..15384e22e331 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -85,6 +85,7 @@ struct fsverity_formatted_digest { #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 #define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 +#define FS_VERITY_METADATA_TYPE_SIGNATURE 3 struct fsverity_read_metadata_arg { __u64 metadata_type; From c8203f86b66311d78429c43c040abc9c64249a7a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 12 Apr 2021 11:22:16 -0700 Subject: [PATCH 188/580] f2fs: remove obsolete f2fs.txt This removes stale f2fs.txt. Reported-by: Neptune Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 734 ----------------------------- 1 file changed, 734 deletions(-) delete mode 100644 Documentation/filesystems/f2fs.txt diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt deleted file mode 100644 index bc17e0192bbe..000000000000 --- a/Documentation/filesystems/f2fs.txt +++ /dev/null @@ -1,734 +0,0 @@ -================================================================================ -WHAT IS Flash-Friendly File System (F2FS)? -================================================================================ - -NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have -been equipped on a variety systems ranging from mobile to server systems. Since -they are known to have different characteristics from the conventional rotating -disks, a file system, an upper layer to the storage device, should adapt to the -changes from the sketch in the design level. - -F2FS is a file system exploiting NAND flash memory-based storage devices, which -is based on Log-structured File System (LFS). The design has been focused on -addressing the fundamental issues in LFS, which are snowball effect of wandering -tree and high cleaning overhead. - -Since a NAND flash memory-based storage device shows different characteristic -according to its internal geometry or flash memory management scheme, namely FTL, -F2FS and its tools support various parameters not only for configuring on-disk -layout, but also for selecting allocation and cleaning algorithms. - -The following git tree provides the file system formatting tool (mkfs.f2fs), -a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs). 
->> git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git - -For reporting bugs and sending patches, please use the following mailing list: ->> linux-f2fs-devel@lists.sourceforge.net - -================================================================================ -BACKGROUND AND DESIGN ISSUES -================================================================================ - -Log-structured File System (LFS) --------------------------------- -"A log-structured file system writes all modifications to disk sequentially in -a log-like structure, thereby speeding up both file writing and crash recovery. -The log is the only structure on disk; it contains indexing information so that -files can be read back from the log efficiently. In order to maintain large free -areas on disk for fast writing, we divide the log into segments and use a -segment cleaner to compress the live information from heavily fragmented -segments." from Rosenblum, M. and Ousterhout, J. K., 1992, "The design and -implementation of a log-structured file system", ACM Trans. Computer Systems -10, 1, 26–52. - -Wandering Tree Problem ----------------------- -In LFS, when a file data is updated and written to the end of log, its direct -pointer block is updated due to the changed location. Then the indirect pointer -block is also updated due to the direct pointer block update. In this manner, -the upper index structures such as inode, inode map, and checkpoint block are -also updated recursively. This problem is called as wandering tree problem [1], -and in order to enhance the performance, it should eliminate or relax the update -propagation as much as possible. - -[1] Bityutskiy, A. 2005. JFFS3 design issues. http://www.linux-mtd.infradead.org/ - -Cleaning Overhead ------------------ -Since LFS is based on out-of-place writes, it produces so many obsolete blocks -scattered across the whole storage. In order to serve new empty log space, it -needs to reclaim these obsolete blocks seamlessly to users. This job is called -as a cleaning process. - -The process consists of three operations as follows. -1. A victim segment is selected through referencing segment usage table. -2. It loads parent index structures of all the data in the victim identified by - segment summary blocks. -3. It checks the cross-reference between the data and its parent index structure. -4. It moves valid data selectively. - -This cleaning job may cause unexpected long delays, so the most important goal -is to hide the latencies to users. And also definitely, it should reduce the -amount of valid data to be moved, and move them quickly as well. - -================================================================================ -KEY FEATURES -================================================================================ - -Flash Awareness ---------------- -- Enlarge the random write area for better performance, but provide the high - spatial locality -- Align FS data structures to the operational units in FTL as best efforts - -Wandering Tree Problem ----------------------- -- Use a term, “node”, that represents inodes as well as various pointer blocks -- Introduce Node Address Table (NAT) containing the locations of all the “node” - blocks; this will cut off the update propagation. 
- -Cleaning Overhead ------------------ -- Support a background cleaning process -- Support greedy and cost-benefit algorithms for victim selection policies -- Support multi-head logs for static/dynamic hot and cold data separation -- Introduce adaptive logging for efficient block allocation - -================================================================================ -MOUNT OPTIONS -================================================================================ - -background_gc=%s Turn on/off cleaning operations, namely garbage - collection, triggered in background when I/O subsystem is - idle. If background_gc=on, it will turn on the garbage - collection and if background_gc=off, garbage collection - will be turned off. If background_gc=sync, it will turn - on synchronous garbage collection running in background. - Default value for this option is on. So garbage - collection is on by default. -disable_roll_forward Disable the roll-forward recovery routine -norecovery Disable the roll-forward recovery routine, mounted read- - only (i.e., -o ro,disable_roll_forward) -discard/nodiscard Enable/disable real-time discard in f2fs, if discard is - enabled, f2fs will issue discard/TRIM commands when a - segment is cleaned. -no_heap Disable heap-style segment allocation which finds free - segments for data from the beginning of main area, while - for node from the end of main area. -nouser_xattr Disable Extended User Attributes. Note: xattr is enabled - by default if CONFIG_F2FS_FS_XATTR is selected. -noacl Disable POSIX Access Control List. Note: acl is enabled - by default if CONFIG_F2FS_FS_POSIX_ACL is selected. -active_logs=%u Support configuring the number of active logs. In the - current design, f2fs supports only 2, 4, and 6 logs. - Default number is 6. -disable_ext_identify Disable the extension list configured by mkfs, so f2fs - does not aware of cold files such as media files. -inline_xattr Enable the inline xattrs feature. -noinline_xattr Disable the inline xattrs feature. -inline_xattr_size=%u Support configuring inline xattr size, it depends on - flexible inline xattr feature. -inline_data Enable the inline data feature: New created small(<~3.4k) - files can be written into inode block. -inline_dentry Enable the inline dir feature: data in new created - directory entries can be written into inode block. The - space of inode block which is used to store inline - dentries is limited to ~3.4k. -noinline_dentry Disable the inline dentry feature. -flush_merge Merge concurrent cache_flush commands as much as possible - to eliminate redundant command issues. If the underlying - device handles the cache_flush command relatively slowly, - recommend to enable this option. -nobarrier This option can be used if underlying storage guarantees - its cached data should be written to the novolatile area. - If this option is set, no cache_flush commands are issued - but f2fs still guarantees the write ordering of all the - data writes. -fastboot This option is used when a system wants to reduce mount - time as much as possible, even though normal performance - can be sacrificed. -extent_cache Enable an extent cache based on rb-tree, it can cache - as many as extent which map between contiguous logical - address and physical address per inode, resulting in - increasing the cache hit ratio. Set by default. -noextent_cache Disable an extent cache based on rb-tree explicitly, see - the above extent_cache mount option. 
-noinline_data Disable the inline data feature, inline data feature is - enabled by default. -data_flush Enable data flushing before checkpoint in order to - persist data of regular and symlink. -reserve_root=%d Support configuring reserved space which is used for - allocation from a privileged user with specified uid or - gid, unit: 4KB, the default limit is 0.2% of user blocks. -resuid=%d The user ID which may use the reserved blocks. -resgid=%d The group ID which may use the reserved blocks. -fault_injection=%d Enable fault injection in all supported types with - specified injection rate. -fault_type=%d Support configuring fault injection type, should be - enabled with fault_injection option, fault type value - is shown below, it supports single or combined type. - Type_Name Type_Value - FAULT_KMALLOC 0x000000001 - FAULT_KVMALLOC 0x000000002 - FAULT_PAGE_ALLOC 0x000000004 - FAULT_PAGE_GET 0x000000008 - FAULT_ALLOC_BIO 0x000000010 - FAULT_ALLOC_NID 0x000000020 - FAULT_ORPHAN 0x000000040 - FAULT_BLOCK 0x000000080 - FAULT_DIR_DEPTH 0x000000100 - FAULT_EVICT_INODE 0x000000200 - FAULT_TRUNCATE 0x000000400 - FAULT_READ_IO 0x000000800 - FAULT_CHECKPOINT 0x000001000 - FAULT_DISCARD 0x000002000 - FAULT_WRITE_IO 0x000004000 -mode=%s Control block allocation mode which supports "adaptive" - and "lfs". In "lfs" mode, there should be no random - writes towards main area. -io_bits=%u Set the bit size of write IO requests. It should be set - with "mode=lfs". -usrquota Enable plain user disk quota accounting. -grpquota Enable plain group disk quota accounting. -prjquota Enable plain project quota accounting. -usrjquota= Appoint specified file and type during mount, so that quota -grpjquota= information can be properly updated during recovery flow, -prjjquota= : must be in root directory; -jqfmt= : [vfsold,vfsv0,vfsv1]. -offusrjquota Turn off user journelled quota. -offgrpjquota Turn off group journelled quota. -offprjjquota Turn off project journelled quota. -quota Enable plain user disk quota accounting. -noquota Disable all plain disk quota option. -whint_mode=%s Control which write hints are passed down to block - layer. This supports "off", "user-based", and - "fs-based". In "off" mode (default), f2fs does not pass - down hints. In "user-based" mode, f2fs tries to pass - down hints given by users. And in "fs-based" mode, f2fs - passes down hints with its policy. -alloc_mode=%s Adjust block allocation policy, which supports "reuse" - and "default". -fsync_mode=%s Control the policy of fsync. Currently supports "posix", - "strict", and "nobarrier". In "posix" mode, which is - default, fsync will follow POSIX semantics and does a - light operation to improve the filesystem performance. - In "strict" mode, fsync will be heavy and behaves in line - with xfs, ext4 and btrfs, where xfstest generic/342 will - pass, but the performance will regress. "nobarrier" is - based on "posix", but doesn't issue flush command for - non-atomic files likewise "nobarrier" mount option. -test_dummy_encryption -test_dummy_encryption=%s - Enable dummy encryption, which provides a fake fscrypt - context. The fake fscrypt context is used by xfstests. - The argument may be either "v1" or "v2", in order to - select the corresponding fscrypt policy version. -checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable" - to reenable checkpointing. Is enabled by default. 
While - disabled, any unmounting or unexpected shutdowns will cause - the filesystem contents to appear as they did when the - filesystem was mounted with that option. - While mounting with checkpoint=disabled, the filesystem must - run garbage collection to ensure that all available space can - be used. If this takes too much time, the mount may return - EAGAIN. You may optionally add a value to indicate how much - of the disk you would be willing to temporarily give up to - avoid additional garbage collection. This can be given as a - number of blocks, or as a percent. For instance, mounting - with checkpoint=disable:100% would always succeed, but it may - hide up to all remaining free space. The actual space that - would be unusable can be viewed at /sys/fs/f2fs//unusable - This space is reclaimed once checkpoint=enable. -compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", - "lz4" and "zstd" algorithm. -compress_log_size=%u Support configuring compress cluster size, the size will - be 4KB * (1 << %u), 16KB is minimum size, also it's - default size. -compress_extension=%s Support adding specified extension, so that f2fs can enable - compression on those corresponding files, e.g. if all files - with '.ext' has high compression rate, we can set the '.ext' - on compression extension list and enable compression on - these file by default rather than to enable it via ioctl. - For other files, we can still enable compression via ioctl. - -================================================================================ -DEBUGFS ENTRIES -================================================================================ - -/sys/kernel/debug/f2fs/ contains information about all the partitions mounted as -f2fs. Each file shows the whole f2fs information. - -/sys/kernel/debug/f2fs/status includes: - - major file system information managed by f2fs currently - - average SIT information about whole segments - - current memory footprint consumed by f2fs. - -================================================================================ -SYSFS ENTRIES -================================================================================ - -Information about mounted f2fs file systems can be found in -/sys/fs/f2fs. Each mounted filesystem will have a directory in -/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda). -The files in each per-device directory are shown in table below. - -Files in /sys/fs/f2fs/ -(see also Documentation/ABI/testing/sysfs-fs-f2fs) - -================================================================================ -USAGE -================================================================================ - -1. Download userland tools and compile them. - -2. Skip, if f2fs was compiled statically inside kernel. - Otherwise, insert the f2fs.ko module. - # insmod f2fs.ko - -3. Create a directory trying to mount - # mkdir /mnt/f2fs - -4. Format the block device, and then mount as f2fs - # mkfs.f2fs -l label /dev/block_device - # mount -t f2fs /dev/block_device /mnt/f2fs - -mkfs.f2fs ---------- -The mkfs.f2fs is for the use of formatting a partition as the f2fs filesystem, -which builds a basic on-disk layout. - -The options consist of: --l [label] : Give a volume label, up to 512 unicode name. --a [0 or 1] : Split start location of each area for heap-based allocation. - 1 is set by default, which performs this. --o [int] : Set overprovision ratio in percent over volume size. - 5 is set by default. --s [int] : Set the number of segments per section. 
- 1 is set by default. --z [int] : Set the number of sections per zone. - 1 is set by default. --e [str] : Set basic extension list. e.g. "mp3,gif,mov" --t [0 or 1] : Disable discard command or not. - 1 is set by default, which conducts discard. - -fsck.f2fs ---------- -The fsck.f2fs is a tool to check the consistency of an f2fs-formatted -partition, which examines whether the filesystem metadata and user-made data -are cross-referenced correctly or not. -Note that, initial version of the tool does not fix any inconsistency. - -The options consist of: - -d debug level [default:0] - -dump.f2fs ---------- -The dump.f2fs shows the information of specific inode and dumps SSA and SIT to -file. Each file is dump_ssa and dump_sit. - -The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem. -It shows on-disk inode information recognized by a given inode number, and is -able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and -./dump_sit respectively. - -The options consist of: - -d debug level [default:0] - -i inode no (hex) - -s [SIT dump segno from #1~#2 (decimal), for all 0~-1] - -a [SSA dump segno from #1~#2 (decimal), for all 0~-1] - -Examples: -# dump.f2fs -i [ino] /dev/sdx -# dump.f2fs -s 0~-1 /dev/sdx (SIT dump) -# dump.f2fs -a 0~-1 /dev/sdx (SSA dump) - -================================================================================ -DESIGN -================================================================================ - -On-disk Layout --------------- - -F2FS divides the whole volume into a number of segments, each of which is fixed -to 2MB in size. A section is composed of consecutive segments, and a zone -consists of a set of sections. By default, section and zone sizes are set to one -segment size identically, but users can easily modify the sizes by mkfs. - -F2FS splits the entire volume into six areas, and all the areas except superblock -consists of multiple segments as described below. - - align with the zone size <-| - |-> align with the segment size - _________________________________________________________________________ - | | | Segment | Node | Segment | | - | Superblock | Checkpoint | Info. | Address | Summary | Main | - | (SB) | (CP) | Table (SIT) | Table (NAT) | Area (SSA) | | - |____________|_____2______|______N______|______N______|______N_____|__N___| - . . - . . - . . - ._________________________________________. - |_Segment_|_..._|_Segment_|_..._|_Segment_| - . . - ._________._________ - |_section_|__...__|_ - . . - .________. - |__zone__| - -- Superblock (SB) - : It is located at the beginning of the partition, and there exist two copies - to avoid file system crash. It contains basic partition information and some - default parameters of f2fs. - -- Checkpoint (CP) - : It contains file system information, bitmaps for valid NAT/SIT sets, orphan - inode lists, and summary entries of current active segments. - -- Segment Information Table (SIT) - : It contains segment information such as valid block count and bitmap for the - validity of all the blocks. - -- Node Address Table (NAT) - : It is composed of a block address table for all the node blocks stored in - Main area. - -- Segment Summary Area (SSA) - : It contains summary entries which contains the owner information of all the - data and node blocks stored in Main area. - -- Main Area - : It contains file and directory data including their indices. 
- -In order to avoid misalignment between file system and flash-based storage, F2FS -aligns the start block address of CP with the segment size. Also, it aligns the -start block address of Main area with the zone size by reserving some segments -in SSA area. - -Reference the following survey for additional technical details. -https://wiki.linaro.org/WorkingGroups/Kernel/Projects/FlashCardSurvey - -File System Metadata Structure ------------------------------- - -F2FS adopts the checkpointing scheme to maintain file system consistency. At -mount time, F2FS first tries to find the last valid checkpoint data by scanning -CP area. In order to reduce the scanning time, F2FS uses only two copies of CP. -One of them always indicates the last valid data, which is called as shadow copy -mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism. - -For file system consistency, each CP points to which NAT and SIT copies are -valid, as shown as below. - - +--------+----------+---------+ - | CP | SIT | NAT | - +--------+----------+---------+ - . . . . - . . . . - . . . . - +-------+-------+--------+--------+--------+--------+ - | CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 | - +-------+-------+--------+--------+--------+--------+ - | ^ ^ - | | | - `----------------------------------------' - -Index Structure ---------------- - -The key data structure to manage the data locations is a "node". Similar to -traditional file structures, F2FS has three types of node: inode, direct node, -indirect node. F2FS assigns 4KB to an inode block which contains 923 data block -indices, two direct node pointers, two indirect node pointers, and one double -indirect node pointer as described below. One direct node block contains 1018 -data blocks, and one indirect node block contains also 1018 node blocks. Thus, -one inode block (i.e., a file) covers: - - 4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB. - - Inode block (4KB) - |- data (923) - |- direct node (2) - | `- data (1018) - |- indirect node (2) - | `- direct node (1018) - | `- data (1018) - `- double indirect node (1) - `- indirect node (1018) - `- direct node (1018) - `- data (1018) - -Note that, all the node blocks are mapped by NAT which means the location of -each node is translated by the NAT table. In the consideration of the wandering -tree problem, F2FS is able to cut off the propagation of node updates caused by -leaf data writes. - -Directory Structure -------------------- - -A directory entry occupies 11 bytes, which consists of the following attributes. - -- hash hash value of the file name -- ino inode number -- len the length of file name -- type file type such as directory, symlink, etc - -A dentry block consists of 214 dentry slots and file names. Therein a bitmap is -used to represent whether each dentry is valid or not. A dentry block occupies -4KB with the following composition. - - Dentry Block(4 K) = bitmap (27 bytes) + reserved (3 bytes) + - dentries(11 * 214 bytes) + file name (8 * 214 bytes) - - [Bucket] - +--------------------------------+ - |dentry block 1 | dentry block 2 | - +--------------------------------+ - . . - . . - . [Dentry Block Structure: 4KB] . - +--------+----------+----------+------------+ - | bitmap | reserved | dentries | file names | - +--------+----------+----------+------------+ - [Dentry Block: 4KB] . . - . . - . . 
- +------+------+-----+------+ - | hash | ino | len | type | - +------+------+-----+------+ - [Dentry Structure: 11 bytes] - -F2FS implements multi-level hash tables for directory structure. Each level has -a hash table with dedicated number of hash buckets as shown below. Note that -"A(2B)" means a bucket includes 2 data blocks. - ----------------------- -A : bucket -B : block -N : MAX_DIR_HASH_DEPTH ----------------------- - -level #0 | A(2B) - | -level #1 | A(2B) - A(2B) - | -level #2 | A(2B) - A(2B) - A(2B) - A(2B) - . | . . . . -level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B) - . | . . . . -level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B) - -The number of blocks and buckets are determined by, - - ,- 2, if n < MAX_DIR_HASH_DEPTH / 2, - # of blocks in level #n = | - `- 4, Otherwise - - ,- 2^(n + dir_level), - | if n + dir_level < MAX_DIR_HASH_DEPTH / 2, - # of buckets in level #n = | - `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1), - Otherwise - -When F2FS finds a file name in a directory, at first a hash value of the file -name is calculated. Then, F2FS scans the hash table in level #0 to find the -dentry consisting of the file name and its inode number. If not found, F2FS -scans the next hash table in level #1. In this way, F2FS scans hash tables in -each levels incrementally from 1 to N. In each levels F2FS needs to scan only -one bucket determined by the following equation, which shows O(log(# of files)) -complexity. - - bucket number to scan in level #n = (hash value) % (# of buckets in level #n) - -In the case of file creation, F2FS finds empty consecutive slots that cover the -file name. F2FS searches the empty slots in the hash tables of whole levels from -1 to N in the same way as the lookup operation. - -The following figure shows an example of two cases holding children. - --------------> Dir <-------------- - | | - child child - - child - child [hole] - child - - child - child - child [hole] - [hole] - child - - Case 1: Case 2: - Number of children = 6, Number of children = 3, - File size = 7 File size = 7 - -Default Block Allocation ------------------------- - -At runtime, F2FS manages six active logs inside "Main" area: Hot/Warm/Cold node -and Hot/Warm/Cold data. - -- Hot node contains direct node blocks of directories. -- Warm node contains direct node blocks except hot node blocks. -- Cold node contains indirect node blocks -- Hot data contains dentry blocks -- Warm data contains data blocks except hot and cold data blocks -- Cold data contains multimedia data or migrated data blocks - -LFS has two schemes for free space management: threaded log and copy-and-compac- -tion. The copy-and-compaction scheme which is known as cleaning, is well-suited -for devices showing very good sequential write performance, since free segments -are served all the time for writing new data. However, it suffers from cleaning -overhead under high utilization. Contrarily, the threaded log scheme suffers -from random writes, but no cleaning process is needed. F2FS adopts a hybrid -scheme where the copy-and-compaction scheme is adopted by default, but the -policy is dynamically changed to the threaded log scheme according to the file -system status. - -In order to align F2FS with underlying flash-based storage, F2FS allocates a -segment in a unit of section. F2FS expects that the section size would be the -same as the unit size of garbage collection in FTL. 
Furthermore, with respect -to the mapping granularity in FTL, F2FS allocates each section of the active -logs from different zones as much as possible, since FTL can write the data in -the active logs into one allocation unit according to its mapping granularity. - -Cleaning process ----------------- - -F2FS does cleaning both on demand and in the background. On-demand cleaning is -triggered when there are not enough free segments to serve VFS calls. Background -cleaner is operated by a kernel thread, and triggers the cleaning job when the -system is idle. - -F2FS supports two victim selection policies: greedy and cost-benefit algorithms. -In the greedy algorithm, F2FS selects a victim segment having the smallest number -of valid blocks. In the cost-benefit algorithm, F2FS selects a victim segment -according to the segment age and the number of valid blocks in order to address -log block thrashing problem in the greedy algorithm. F2FS adopts the greedy -algorithm for on-demand cleaner, while background cleaner adopts cost-benefit -algorithm. - -In order to identify whether the data in the victim segment are valid or not, -F2FS manages a bitmap. Each bit represents the validity of a block, and the -bitmap is composed of a bit stream covering whole blocks in main area. - -Write-hint Policy ------------------ - -1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - -2) whint_mode=user-based. F2FS tries to pass down hints given by -users. - -User F2FS Block ----- ---- ----- - META WRITE_LIFE_NOT_SET - HOT_NODE " - WARM_NODE " - COLD_NODE " -*ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME -*extension list " " - --- buffered io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " " -WRITE_LIFE_MEDIUM " " -WRITE_LIFE_LONG " " - --- direct io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " WRITE_LIFE_NONE -WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM -WRITE_LIFE_LONG " WRITE_LIFE_LONG - -3) whint_mode=fs-based. F2FS passes down hints with its policy. - -User F2FS Block ----- ---- ----- - META WRITE_LIFE_MEDIUM; - HOT_NODE WRITE_LIFE_NOT_SET - WARM_NODE " - COLD_NODE WRITE_LIFE_NONE -ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME -extension list " " - --- buffered io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG -WRITE_LIFE_NONE " " -WRITE_LIFE_MEDIUM " " -WRITE_LIFE_LONG " " - --- direct io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " WRITE_LIFE_NONE -WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM -WRITE_LIFE_LONG " WRITE_LIFE_LONG - -Fallocate(2) Policy -------------------- - -The default policy follows the below posix rule. - -Allocating disk space - The default operation (i.e., mode is zero) of fallocate() allocates - the disk space within the range specified by offset and len. The - file size (as reported by stat(2)) will be changed if offset+len is - greater than the file size. Any subregion within the range specified - by offset and len that did not contain data before the call will be - initialized to zero. This default behavior closely resembles the - behavior of the posix_fallocate(3) library function, and is intended - as a method of optimally implementing that function. 
- -However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to -fallocate(fd, DEFAULT_MODE), it allocates on-disk blocks addressess having -zero or random data, which is useful to the below scenario where: - 1. create(fd) - 2. ioctl(fd, F2FS_IOC_SET_PIN_FILE) - 3. fallocate(fd, 0, 0, size) - 4. address = fibmap(fd, offset) - 5. open(blkdev) - 6. write(blkdev, address) - -Compression implementation --------------------------- - -- New term named cluster is defined as basic unit of compression, file can -be divided into multiple clusters logically. One cluster includes 4 << n -(n >= 0) logical pages, compression size is also cluster size, each of -cluster can be compressed or not. - -- In cluster metadata layout, one special block address is used to indicate -cluster is compressed one or normal one, for compressed cluster, following -metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs -stores data including compress header and compressed data. - -- In order to eliminate write amplification during overwrite, F2FS only -support compression on write-once file, data can be compressed only when -all logical blocks in file are valid and cluster compress ratio is lower -than specified threshold. - -- To enable compression on regular inode, there are three ways: -* chattr +c file -* chattr +c dir; touch dir/file -* mount w/ -o compress_extension=ext; touch file.ext - -Compress metadata layout: - [Dnode Structure] - +-----------------------------------------------+ - | cluster 1 | cluster 2 | ......... | cluster N | - +-----------------------------------------------+ - . . . . - . . . . - . Compressed Cluster . . Normal Cluster . -+----------+---------+---------+---------+ +---------+---------+---------+---------+ -|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 | -+----------+---------+---------+---------+ +---------+---------+---------+---------+ - . . - . . - . . - +-------------+-------------+----------+----------------------------+ - | data length | data chksum | reserved | compressed data | - +-------------+-------------+----------+----------------------------+ From ae6e619e82e2d5570c9943600b9fa241b292ca50 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 9 Jan 2020 14:30:41 +0100 Subject: [PATCH 189/580] fs: Enable bmap() function to properly return errors By now, bmap() will either return the physical block number related to the requested file offset or 0 in case of error or the requested offset maps into a hole. This patch makes the needed changes to enable bmap() to proper return errors, using the return value as an error return, and now, a pointer must be passed to bmap() to be filled with the mapped physical block. It will change the behavior of bmap() on return: - negative value in case of error - zero on success or map fell into a hole In case of a hole, the *block will be zero too Since this is a prep patch, by now, the only error return is -EINVAL if ->bmap doesn't exist. 
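For illustration, a minimal sketch of how a caller adapts to the new interface; the wrapper function below is hypothetical and only the bmap() signature and return convention come from this patch:

/* Hypothetical helper, not part of the patch: map one logical block. */
static sector_t example_lookup_block(struct inode *inode, sector_t lblk)
{
	sector_t blk = lblk;	/* in: logical block within the file */
	int err;

	/* old style was: blk = bmap(inode, lblk); with 0 meaning error or hole */
	err = bmap(inode, &blk);
	if (err)		/* -EINVAL: no ->bmap method */
		return 0;
	if (!blk)		/* hole: *block was set to zero */
		return 0;
	return blk;		/* physical block relative to the device start */
}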
Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino Signed-off-by: Al Viro --- drivers/md/md-bitmap.c | 16 ++++++++++------ fs/f2fs/data.c | 16 +++++++++++----- fs/inode.c | 30 ++++++++++++++++++------------ fs/jbd2/journal.c | 22 +++++++++++++++------- include/linux/fs.h | 9 ++++++++- mm/page_io.c | 11 +++++++---- 6 files changed, 69 insertions(+), 35 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 2fc8c113977f..0bbd55f45b25 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -363,7 +363,7 @@ static int read_page(struct file *file, unsigned long index, int ret = 0; struct inode *inode = file_inode(file); struct buffer_head *bh; - sector_t block; + sector_t block, blk_cur; pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); @@ -374,17 +374,21 @@ static int read_page(struct file *file, unsigned long index, goto out; } attach_page_buffers(page, bh); - block = index << (PAGE_SHIFT - inode->i_blkbits); + blk_cur = index << (PAGE_SHIFT - inode->i_blkbits); while (bh) { + block = blk_cur; + if (count == 0) bh->b_blocknr = 0; else { - bh->b_blocknr = bmap(inode, block); - if (bh->b_blocknr == 0) { - /* Cannot use this file! */ + ret = bmap(inode, &block); + if (ret || !block) { ret = -EINVAL; + bh->b_blocknr = 0; goto out; } + + bh->b_blocknr = block; bh->b_bdev = inode->i_sb->s_bdev; if (count < (1<i_blkbits)) count = 0; @@ -398,7 +402,7 @@ static int read_page(struct file *file, unsigned long index, set_buffer_mapped(bh); submit_bh(REQ_OP_READ, 0, bh); } - block++; + blk_cur++; bh = bh->b_this_page; } page->index = index; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e1352ec0f210..a06fbcc38c8c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3880,12 +3880,16 @@ static int check_swap_activate(struct swap_info_struct *sis, page_no < sis->max) { unsigned block_in_page; sector_t first_block; + sector_t block = 0; + int err = 0; cond_resched(); - first_block = bmap(inode, probe_block); - if (first_block == 0) + block = probe_block; + err = bmap(inode, &block); + if (err || !block) goto bad_bmap; + first_block = block; /* * It must be PAGE_SIZE aligned on-disk @@ -3897,11 +3901,13 @@ static int check_swap_activate(struct swap_info_struct *sis, for (block_in_page = 1; block_in_page < blocks_per_page; block_in_page++) { - sector_t block; - block = bmap(inode, probe_block + block_in_page); - if (block == 0) + block = probe_block + block_in_page; + err = bmap(inode, &block); + + if (err || !block) goto bad_bmap; + if (block != first_block + block_in_page) { /* Discontiguity */ probe_block++; diff --git a/fs/inode.c b/fs/inode.c index e3195a2821c8..961808880f43 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1576,25 +1576,31 @@ void iput(struct inode *inode) } EXPORT_SYMBOL(iput); +#ifdef CONFIG_BLOCK /** * bmap - find a block number in a file - * @inode: inode of file - * @block: block to find + * @inode: inode owning the block number being requested + * @block: pointer containing the block to find * - * Returns the block number on the device holding the inode that - * is the disk block number for the block of the file requested. - * That is, asked for block 4 of inode 1 the function will return the - * disk block relative to the disk start that holds that block of the - * file. + * Replaces the value in *block with the block number on the device holding + * corresponding to the requested block number in the file. 
+ * That is, asked for block 4 of inode 1 the function will replace the + * 4 in *block, with disk block relative to the disk start that holds that + * block of the file. + * + * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a + * hole, returns 0 and *block is also set to 0. */ -sector_t bmap(struct inode *inode, sector_t block) +int bmap(struct inode *inode, sector_t *block) { - sector_t res = 0; - if (inode->i_mapping->a_ops->bmap) - res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); - return res; + if (!inode->i_mapping->a_ops->bmap) + return -EINVAL; + + *block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block); + return 0; } EXPORT_SYMBOL(bmap); +#endif /* * With relative atime, only update atime if the previous atime is diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8ef6b6daaa7a..7acaf6f55404 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -814,18 +814,23 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, { int err = 0; unsigned long long ret; + sector_t block = 0; if (journal->j_inode) { - ret = bmap(journal->j_inode, blocknr); - if (ret) - *retp = ret; - else { + block = blocknr; + ret = bmap(journal->j_inode, &block); + + if (ret || !block) { printk(KERN_ALERT "%s: journal block not found " "at offset %lu on %s\n", __func__, blocknr, journal->j_devname); err = -EIO; __journal_abort_soft(journal, err); + + } else { + *retp = block; } + } else { *retp = blocknr; /* +journal->j_blk_offset */ } @@ -1251,11 +1256,14 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev, journal_t *jbd2_journal_init_inode(struct inode *inode) { journal_t *journal; + sector_t blocknr; char *p; - unsigned long long blocknr; + int err = 0; - blocknr = bmap(inode, 0); - if (!blocknr) { + blocknr = 0; + err = bmap(inode, &blocknr); + + if (err || !blocknr) { pr_err("%s: Cannot locate journal superblock\n", __func__); return NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index a54ccfb9ea4a..78a70cb337ae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2784,9 +2784,16 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) extern void emergency_sync(void); extern void emergency_remount(void); + #ifdef CONFIG_BLOCK -extern sector_t bmap(struct inode *, sector_t); +extern int bmap(struct inode *inode, sector_t *block); +#else +static inline int bmap(struct inode *inode, sector_t *block) +{ + return -EINVAL; +} #endif + extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); diff --git a/mm/page_io.c b/mm/page_io.c index aafd19ec1db4..994c996414c3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -177,8 +177,9 @@ int generic_swapfile_activate(struct swap_info_struct *sis, cond_resched(); - first_block = bmap(inode, probe_block); - if (first_block == 0) + first_block = probe_block; + ret = bmap(inode, &first_block); + if (ret || !first_block) goto bad_bmap; /* @@ -193,9 +194,11 @@ int generic_swapfile_activate(struct swap_info_struct *sis, block_in_page++) { sector_t block; - block = bmap(inode, probe_block + block_in_page); - if (block == 0) + block = probe_block + block_in_page; + ret = bmap(inode, &block); + if (ret || !block) goto bad_bmap; + if (block != first_block + block_in_page) { /* Discontiguity */ probe_block++; From d9dd046bbadbfa96e90fd453be02b04608fbff6e Mon Sep 17 00:00:00 2001 From: jiahao Date: Fri, 19 Feb 2021 20:46:32 +0800 Subject: [PATCH 
190/580] f2fs: fix a spacing coding style Add a space before the plus. Signed-off-by: jiahao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c7fc8a33616f..762c035d744a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1456,7 +1456,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) orphan_blocks); if (__remain_node_summaries(cpc->reason)) - ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); else From 72e856797369f33b44588eea1015d5c52374fe8f Mon Sep 17 00:00:00 2001 From: xuyehan Date: Tue, 23 Feb 2021 09:31:43 +0800 Subject: [PATCH 191/580] f2fs: fix a spelling error Delete the letter 'e' before 'number' Signed-off-by: xuyehan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 8c90e0764050..4ada9899ad84 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -266,7 +266,7 @@ Date April 2019 Contact: "Daniel Rosenberg" Description: If checkpoint=disable, it displays the number of blocks that are unusable. - If checkpoint=enable it displays the enumber of blocks that + If checkpoint=enable it displays the number of blocks that would be unusable if checkpoint=disable were to be set. What: /sys/fs/f2fs/<disk>/encoding From 8e777aef953499b9cfa1673b06597d78c0f775e1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:35:40 +0800 Subject: [PATCH 192/580] f2fs: fix to allow migrating fully valid segment F2FS_IOC_FLUSH_DEVICE/F2FS_IOC_RESIZE_FS needs to migrate all blocks of the target segment elsewhere, no matter whether the segment has partially or fully valid blocks. However, after commit 803e74be04b3 ("f2fs: stop GC when the victim becomes fully valid"), we may skip migration because the target segment is fully valid, which makes the ioctl fail; fix this.
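For reference, the updated prototype reads as below; the per-parameter notes are editorial annotations based on the hunks that follow, not text from the patch:

int f2fs_gc(struct f2fs_sb_info *sbi,
	    bool sync,		/* true: run synchronous foreground GC */
	    bool background,	/* true: background GC is permitted */
	    bool force,		/* new: migrate even a fully valid segment */
	    unsigned int segno);	/* victim segment, or NULL_SEGNO to pick one */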
Fixes: 803e74be04b3 ("f2fs: stop GC when the victim becomes fully valid") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 9 +++++---- fs/f2fs/gc.c | 21 ++++++++++++--------- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 2 +- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 392bfd8acd34..4106560575e1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3558,7 +3558,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, unsigned int segno); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ae9bc814fc11..9bb03379241b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1678,7 +1678,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); + err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err && err != -ENODATA && err != -EAGAIN) goto out_err; } @@ -2513,7 +2513,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); + ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); out: mnt_drop_write_file(filp); return ret; @@ -2549,7 +2549,8 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, GET_SEGNO(sbi, range->start)); + ret = f2fs_gc(sbi, range->sync, true, false, + GET_SEGNO(sbi, range->start)); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -3002,7 +3003,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, start_segno); + ret = f2fs_gc(sbi, true, true, true, start_segno); if (ret == -EAGAIN) ret = 0; else if (ret < 0) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 39330ad3c44e..b3af76340026 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -112,7 +112,7 @@ static int gc_thread_func(void *data) sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, true, false, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -1354,7 +1354,8 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, * the victim data block is ignored. */ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, - struct gc_inode_list *gc_list, unsigned int segno, int gc_type) + struct gc_inode_list *gc_list, unsigned int segno, int gc_type, + bool force_migrate) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; @@ -1383,8 +1384,8 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, * race condition along with SSR block allocation. 
*/ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || - get_valid_blocks(sbi, segno, true) == - BLKS_PER_SEC(sbi)) + (!force_migrate && get_valid_blocks(sbi, segno, true) == + BLKS_PER_SEC(sbi))) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1519,7 +1520,8 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int start_segno, - struct gc_inode_list *gc_list, int gc_type) + struct gc_inode_list *gc_list, int gc_type, + bool force_migrate) { struct page *sum_page; struct f2fs_summary_block *sum; @@ -1606,7 +1608,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); else submitted += gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); + segno, gc_type, + force_migrate); stat_inc_seg_count(sbi, type, gc_type); migrated++; @@ -1634,7 +1637,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, unsigned int segno) + bool background, bool force, unsigned int segno) { int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0, seg_freed = 0, total_freed = 0; @@ -1696,7 +1699,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, if (ret) goto stop; - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); if (gc_type == FG_GC && seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) sec_freed++; @@ -1835,7 +1838,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - do_garbage_collect(sbi, segno, &gc_list, FG_GC); + do_garbage_collect(sbi, segno, &gc_list, FG_GC, true); put_gc_inode(&gc_list); if (!gc_only && get_valid_blocks(sbi, segno, true)) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c6ea8bed9502..1638c8ad91d8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -504,7 +504,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, NULL_SEGNO); + f2fs_gc(sbi, false, false, false, NULL_SEGNO); } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f43fad1cb666..9d4a228dfd52 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1852,7 +1852,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); + err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err == -ENODATA) { err = 0; break; From 546ce0066f72c22f9350bea8401d2d46eb2e6f90 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:35:41 +0800 Subject: [PATCH 193/580] f2fs: fix panic during f2fs_resize_fs() f2fs_resize_fs() hangs in below callstack with testcase: - mkfs 16GB image & mount image - dd 8GB fileA - dd 8GB fileB - sync - rm fileA - sync - resize filesystem to 8GB kernel BUG at segment.c:2484! 
Call Trace: allocate_segment_by_default+0x92/0xf0 [f2fs] f2fs_allocate_data_block+0x44b/0x7e0 [f2fs] do_write_page+0x5a/0x110 [f2fs] f2fs_outplace_write_data+0x55/0x100 [f2fs] f2fs_do_write_data_page+0x392/0x850 [f2fs] move_data_page+0x233/0x320 [f2fs] do_garbage_collect+0x14d9/0x1660 [f2fs] free_segment_range+0x1f7/0x310 [f2fs] f2fs_resize_fs+0x118/0x330 [f2fs] __f2fs_ioctl+0x487/0x3680 [f2fs] __x64_sys_ioctl+0x8e/0xd0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The root cause is we forgot to check that whether we have enough space in resized filesystem to store all valid blocks in before-resizing filesystem, then allocator will run out-of-space during block migration in free_segment_range(). Fixes: b4b10061ef98 ("f2fs: refactor resize_fs to avoid meta updates in progress") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b3af76340026..86ba8ed0b8a7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1977,7 +1977,20 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) /* stop CP to protect MAIN_SEC in free_segment_range */ f2fs_lock_op(sbi); + + spin_lock(&sbi->stat_lock); + if (shrunk_blocks + valid_user_blocks(sbi) + + sbi->current_reserved_blocks + sbi->unusable_block_count + + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) + err = -ENOSPC; + spin_unlock(&sbi->stat_lock); + + if (err) + goto out_unlock; + err = free_segment_range(sbi, secs, true); + +out_unlock: f2fs_unlock_op(sbi); up_write(&sbi->gc_lock); if (err) From 7b11e7150553e3472b9f52eb442b1835d39da19e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:38:41 +0800 Subject: [PATCH 194/580] f2fs: avoid unused f2fs_show_compress_options() LKP reports: fs/f2fs/super.c:1516:20: warning: unused function 'f2fs_show_compress_options' [-Wunused-function] static inline void f2fs_show_compress_options(struct seq_file *seq, Fix this issue by covering f2fs_show_compress_options() with CONFIG_F2FS_FS_COMPRESSION macro. Fixes: 4c8ff7095bef ("f2fs: support data compression") Reported-by: kernel test robot Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9d4a228dfd52..a2e9130d63f2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1611,6 +1611,7 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #endif } +#ifdef CONFIG_F2FS_FS_COMPRESSION static inline void f2fs_show_compress_options(struct seq_file *seq, struct super_block *sb) { @@ -1653,6 +1654,7 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) seq_printf(seq, ",compress_mode=%s", "user"); } +#endif static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { From 2ffd22e22364adb5f30d5c3c963e65cf46fad3fe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:38:42 +0800 Subject: [PATCH 195/580] f2fs: remove unused FORCE_FG_GC macro FORCE_FG_GC was introduced by commit 6aefd93b0137 ("f2fs: introduce background_gc=sync mount option"), but never be used, remove it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 229814b4f4a6..144980b62f9e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -172,12 +172,10 @@ enum { /* * BG_GC means the background cleaning job. 
* FG_GC means the on-demand cleaning job. - * FORCE_FG_GC means on-demand cleaning job in background. */ enum { BG_GC = 0, FG_GC, - FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ From bf2cd557fac6155db20d7ac62aa89260a3c4cd86 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:38:43 +0800 Subject: [PATCH 196/580] f2fs: update comments for explicit memory barrier Add more detailed comments for explicit memory barrier used by f2fs, in order to enhance code readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- fs/f2fs/segment.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 762c035d744a..31f951e31dca 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1818,7 +1818,11 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) llist_add(&req.llnode, &cprc->issue_list); atomic_inc(&cprc->queued_ckpt); - /* update issue_list before we wake up issue_checkpoint thread */ + /* + * update issue_list before we wake up issue_checkpoint thread, + * this smp_mb() pairs with another barrier in ___wait_event(), + * see more details in comments of waitqueue_active(). + */ smp_mb(); if (waitqueue_active(&cprc->ckpt_wait_queue)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1638c8ad91d8..3902aead8812 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -653,7 +653,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) llist_add(&cmd.llnode, &fcc->issue_list); - /* update issue_list before we wake up issue_flush thread */ + /* + * update issue_list before we wake up issue_flush thread, this + * smp_mb() pairs with another barrier in ___wait_event(), see + * more details in comments of waitqueue_active(). + */ smp_mb(); if (waitqueue_active(&fcc->flush_wait_queue)) From 5cc26ee5358dd58027c007e25f1efccd508848b1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Feb 2021 18:07:33 +0800 Subject: [PATCH 197/580] f2fs: check discard command number before traversing discard pending list In trim thread, let's add a condition to check discard command number before traversing discard pending list, it can avoid unneeded traversing if there is no discard command. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3902aead8812..cb4e25ab5109 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1759,6 +1759,8 @@ static int issue_discard_thread(void *data) wait_ms = dpolicy.max_interval; continue; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + continue; if (sbi->gc_mode == GC_URGENT_HIGH) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); From 816876905a273f2779f8e10502a7cc8b09c4cb1c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 26 Feb 2021 16:51:42 +0100 Subject: [PATCH 198/580] f2fs: compress: Allow modular (de)compression algorithms If F2FS_FS is modular, enabling the compressions options F2FS_FS_{LZ4,LZ4HZ,LZO,LZORLE,ZSTD} will make the (de)compression algorithms {LZ4,LZ4HC,LZO,ZSTD}_{,DE}COMPRESS builtin instead of modular, as the former depend on an intermediate boolean F2FS_FS_COMPRESSION, which in-turn depends on tristate F2FS_FS. Indeed, if a boolean symbol A depends directly on a tristate symbol B and selects another tristate symbol C: tristate B tristate C bool A depends on B select C and B is modular, then C will also be modular. 
However, if there is an intermediate boolean D in the dependency chain between A and B: tristate B tristate C bool D depends on B bool A depends on D select C then the modular state won't propagate from B to C, and C will be builtin instead of modular. As modular dependency propagation through intermediate symbols is obscure, fix this in a robust way by moving the selection of tristate (de)compression algorithms from the boolean compression options to the tristate main F2FS_FS option. Signed-off-by: Geert Uytterhoeven Reviewed-by: Chao Yu Reviewed-by: Masahiro Yamada Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 5c0c113cf411..db29224df9b3 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -6,6 +6,13 @@ config F2FS_FS select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select LZ4_COMPRESS if F2FS_FS_LZ4 + select LZ4_DECOMPRESS if F2FS_FS_LZ4 + select LZ4HC_COMPRESS if F2FS_FS_LZ4HC + select LZO_COMPRESS if F2FS_FS_LZO + select LZO_DECOMPRESS if F2FS_FS_LZO + select ZSTD_COMPRESS if F2FS_FS_ZSTD + select ZSTD_DECOMPRESS if F2FS_FS_ZSTD help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -102,8 +109,6 @@ config F2FS_FS_COMPRESSION config F2FS_FS_LZO bool "LZO compression support" depends on F2FS_FS_COMPRESSION - select LZO_COMPRESS - select LZO_DECOMPRESS default y help Support LZO compress algorithm, if unsure, say Y. @@ -111,8 +116,6 @@ config F2FS_FS_LZO config F2FS_FS_LZ4 bool "LZ4 compression support" depends on F2FS_FS_COMPRESSION - select LZ4_COMPRESS - select LZ4_DECOMPRESS default y help Support LZ4 compress algorithm, if unsure, say Y. @@ -121,7 +124,6 @@ config F2FS_FS_LZ4HC bool "LZ4HC compression support" depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZ4 - select LZ4HC_COMPRESS default y help Support LZ4HC compress algorithm, LZ4HC has compatible on-disk @@ -130,8 +132,6 @@ config F2FS_FS_LZ4HC config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION - select ZSTD_COMPRESS - select ZSTD_DECOMPRESS default y help Support ZSTD compress algorithm, if unsure, say Y. From 9c59a4ad6c2cc90a93861e97cb499a860045a949 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 24 Feb 2021 13:03:13 -0600 Subject: [PATCH 199/580] f2fs: Replace one-element array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Refactor the code according to the use of a flexible-array member in struct f2fs_checkpoint, instead of a one-element arrays. 
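As a generic illustration of the two styles (the structure names below are invented for the example and do not come from f2fs):

/* Old style: a one-element array standing in for a variable-length tail;
 * sizeof() includes one phantom element. */
struct old_record {
	unsigned int	count;
	unsigned char	payload[1];	/* really 'count' bytes follow */
};

/* Preferred style: a flexible array member; sizeof(struct new_record)
 * covers only the fixed part and no phantom element exists. */
struct new_record {
	unsigned int	count;
	unsigned char	payload[];
};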
Notice that a temporary pointer to void '*tmp_ptr' was used in order to fix the following errors when using a flexible array instead of a one element array in struct f2fs_checkpoint: CC [M] fs/f2fs/dir.o In file included from fs/f2fs/dir.c:13: fs/f2fs/f2fs.h: In function ‘__bitmap_ptr’: fs/f2fs/f2fs.h:2227:40: error: invalid use of flexible array member 2227 | return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); | ^ fs/f2fs/f2fs.h:2227:49: error: invalid use of flexible array member 2227 | return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); | ^ fs/f2fs/f2fs.h:2238:40: error: invalid use of flexible array member 2238 | return &ckpt->sit_nat_version_bitmap + offset; | ^ make[2]: *** [scripts/Makefile.build:287: fs/f2fs/dir.o] Error 1 make[1]: *** [scripts/Makefile.build:530: fs/f2fs] Error 2 make: *** [Makefile:1819: fs] Error 2 [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Build-tested-by: kernel test robot Link: https://lore.kernel.org/lkml/603647e4.DeEFbl4eqljuwAUe%25lkp@intel.com/ Signed-off-by: Gustavo A. R. Silva Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- include/linux/f2fs_fs.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4106560575e1..e0592f281cd4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2221,6 +2221,7 @@ static inline block_t __cp_payload(struct f2fs_sb_info *sbi) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + void *tmp_ptr = &ckpt->sit_nat_version_bitmap; int offset; if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { @@ -2230,7 +2231,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) * if large_nat_bitmap feature is enabled, leave checksum * protection for all nat/sit bitmaps. */ - return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); + return tmp_ptr + offset + sizeof(__le32); } if (__cp_payload(sbi) > 0) { @@ -2241,7 +2242,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) } else { offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return tmp_ptr + offset; } } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c6cc0a566ef5..5487a80617a3 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -168,7 +168,7 @@ struct f2fs_checkpoint { unsigned char alloc_type[MAX_ACTIVE_LOGS]; /* SIT and NAT version bitmap */ - unsigned char sit_nat_version_bitmap[1]; + unsigned char sit_nat_version_bitmap[]; } __packed; #define CP_CHKSUM_OFFSET 4092 /* default chksum offset in checkpoint */ From 5b3ee5d80a7df0fa1a87614ae5b90f490eb3abb7 Mon Sep 17 00:00:00 2001 From: "huangjianan@oppo.com" Date: Sat, 27 Feb 2021 20:02:29 +0800 Subject: [PATCH 200/580] f2fs: remove unnecessary IS_SWAPFILE check Now swapfile in f2fs directly submit IO to blockdev according to swapfile extents reported by f2fs when swapon, therefore there is no need to check IS_SWAPFILE when exec filesystem operation. 
Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a06fbcc38c8c..1eb9c337b631 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1684,7 +1684,7 @@ static int get_data_block_dio_write(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DIO, NULL, f2fs_rw_hint_to_seg_type(inode->i_write_hint), - IS_SWAPFILE(inode) ? false : true); + true); } static int get_data_block_dio(struct inode *inode, sector_t iblock, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e0592f281cd4..6522bad78941 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4204,8 +4204,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, if (F2FS_IO_ALIGNED(sbi)) return true; } - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) && - !IS_SWAPFILE(inode)) + if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) return true; return false; From c3e7687817c357aedf536de8a405bc384019c06f Mon Sep 17 00:00:00 2001 From: "huangjianan@oppo.com" Date: Sat, 27 Feb 2021 20:02:30 +0800 Subject: [PATCH 201/580] f2fs: fix last_lblock check in check_swap_activate_fast Because page_no < sis->max guarantees that the while loop breaks out normally, the wrong check condition here doesn't cause a problem. Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1eb9c337b631..1b4e741acd7c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3793,7 +3793,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, last_lblock = bytes_to_blks(inode, i_size_read(inode)); len = i_size_read(inode); - while (cur_lblock <= last_lblock && cur_lblock < sis->max) { + while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; pgoff_t next_pgofs; From 619f18260e86d4c118b8833a2f4470e4b477e097 Mon Sep 17 00:00:00 2001 From: "huangjianan@oppo.com" Date: Mon, 1 Mar 2021 12:58:44 +0800 Subject: [PATCH 202/580] f2fs: check if swapfile is section-aligned If the swapfile isn't created by pin and fallocate, it can't be guaranteed section-aligned, so it may be selected by f2fs gc. When gc_pin_file_threshold is reached, the address of the swapfile may change, but won't be synchronized to swap_extent, so swap will write to the wrong address, which will cause data corruption.
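For context, a minimal userspace sketch of the supported way to create an f2fs swapfile, i.e. pin it first and then fallocate it, so that its blocks stay section-aligned and are never migrated by GC. The file path and size are examples, error handling is omitted, and the F2FS_IOC_SET_PIN_FILE definition is assumed to come from the f2fs headers (linux/f2fs.h on recent kernels):

#define _GNU_SOURCE		/* for fallocate() */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/f2fs.h>		/* F2FS_IOC_SET_PIN_FILE, if available */

int main(void)
{
	int fd = open("/swapfile", O_CREAT | O_RDWR, 0600);
	__u32 pin = 1;

	ioctl(fd, F2FS_IOC_SET_PIN_FILE, &pin);	/* pin before allocating */
	fallocate(fd, 0, 0, 2UL << 30);		/* reserve 2 GiB up front */
	close(fd);
	return 0;	/* then: mkswap /swapfile && swapon /swapfile */
}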
Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 109 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1b4e741acd7c..62fe53ea75a3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3770,11 +3770,64 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP +static int f2fs_is_file_aligned(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + block_t main_blkaddr = SM_I(sbi)->main_blkaddr; + block_t cur_lblock; + block_t last_lblock; + block_t pblock; + unsigned long nr_pblocks; + unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + int ret = 0; + + cur_lblock = 0; + last_lblock = bytes_to_blks(inode, i_size_read(inode)); + + while (cur_lblock < last_lblock) { + struct f2fs_map_blocks map; + + memset(&map, 0, sizeof(map)); + map.m_lblk = cur_lblock; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + if (ret) + goto out; + + /* hole */ + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes\n"); + ret = -ENOENT; + goto out; + } + + pblock = map.m_pblk; + nr_pblocks = map.m_len; + + if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || + nr_pblocks & (blocks_per_sec - 1)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + + cur_lblock += nr_pblocks; + } +out: + return ret; +} + static int check_swap_activate_fast(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); sector_t cur_lblock; sector_t last_lblock; sector_t pblock; @@ -3782,8 +3835,8 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; - u64 len; - int ret; + unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + int ret = 0; /* * Map all the blocks into the extent list. 
This code doesn't try @@ -3791,31 +3844,41 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, */ cur_lblock = 0; last_lblock = bytes_to_blks(inode, i_size_read(inode)); - len = i_size_read(inode); while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; - pgoff_t next_pgofs; cond_resched(); memset(&map, 0, sizeof(map)); map.m_lblk = cur_lblock; - map.m_len = bytes_to_blks(inode, len) - cur_lblock; - map.m_next_pgofs = &next_pgofs; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) - goto err_out; + goto out; /* hole */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) - goto err_out; + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes\n"); + ret = -ENOENT; + goto out; + } pblock = map.m_pblk; nr_pblocks = map.m_len; + if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || + nr_pblocks & (blocks_per_sec - 1)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; @@ -3844,9 +3907,6 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->highest_bit = cur_lblock - 1; out: return ret; -err_out: - pr_err("swapon: swapfile has holes\n"); - return -EINVAL; } /* Copied from generic_swapfile_activate() to check any holes */ @@ -3855,6 +3915,7 @@ static int check_swap_activate(struct swap_info_struct *sis, { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned blocks_per_page; unsigned long page_no; sector_t probe_block; @@ -3862,11 +3923,15 @@ static int check_swap_activate(struct swap_info_struct *sis, sector_t lowest_block = -1; sector_t highest_block = 0; int nr_extents = 0; - int ret; + int ret = 0; if (PAGE_SIZE == F2FS_BLKSIZE) return check_swap_activate_fast(sis, swap_file, span); + ret = f2fs_is_file_aligned(inode); + if (ret) + goto out; + blocks_per_page = bytes_to_blks(inode, PAGE_SIZE); /* @@ -3881,13 +3946,14 @@ static int check_swap_activate(struct swap_info_struct *sis, unsigned block_in_page; sector_t first_block; sector_t block = 0; - int err = 0; cond_resched(); block = probe_block; - err = bmap(inode, &block); - if (err || !block) + ret = bmap(inode, &block); + if (ret) + goto out; + if (!block) goto bad_bmap; first_block = block; @@ -3903,9 +3969,10 @@ static int check_swap_activate(struct swap_info_struct *sis, block_in_page++) { block = probe_block + block_in_page; - err = bmap(inode, &block); - - if (err || !block) + ret = bmap(inode, &block); + if (ret) + goto out; + if (!block) goto bad_bmap; if (block != first_block + block_in_page) { @@ -3945,8 +4012,8 @@ static int check_swap_activate(struct swap_info_struct *sis, out: return ret; bad_bmap: - pr_err("swapon: swapfile has holes\n"); - return -EINVAL; + f2fs_err(sbi, "Swapfile has holes\n"); + return -ENOENT; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, From 344bbc8c7400ccf6bb4105ca8a148bfe59c0d3c3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 2 Mar 2021 16:35:32 +0800 Subject: [PATCH 203/580] f2fs: remove unused file_clear_encrypt() - file_clear_encrypt() was never be used, remove it. - In addition, relocating macros for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6522bad78941..65a6e02d581c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -636,21 +636,26 @@ enum { #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) -#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) -#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) + +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + #define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) #define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) -#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) + #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) + #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) + #define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) #define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) #define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) + #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) From 153861ab7098b714e9a78a28b75ba245e0f9b0cd Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 4 Mar 2021 09:21:18 +0000 Subject: [PATCH 204/580] f2fs: fix a redundant call to f2fs_balance_fs if an error occurs The uninitialized variable dn.node_changed does not get set when a call to f2fs_get_node_page fails. This uninitialized value gets used in the call to f2fs_balance_fs() that may or not may not balances dirty node and dentry pages depending on the uninitialized state of the variable. Fix this by only calling f2fs_balance_fs if err is not set. Thanks to Jaegeuk Kim for suggesting an appropriate fix. Addresses-Coverity: ("Uninitialized scalar variable") Fixes: 2a3407607028 ("f2fs: call f2fs_balance_fs only when node was changed") Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 936df9342fcd..38011ef34ac0 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -218,7 +218,8 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_put_page(page, 1); - f2fs_balance_fs(sbi, dn.node_changed); + if (!err) + f2fs_balance_fs(sbi, dn.node_changed); return err; } From d2a279a043606664f3b208d256f0825a6ce35ed1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Mar 2021 21:43:10 -0800 Subject: [PATCH 205/580] f2fs: fix error handling in f2fs_end_enable_verity() f2fs didn't properly clean up if verity failed to be enabled on a file: - It left verity metadata (pages past EOF) in the page cache, which would be exposed to userspace if the file was later extended. - It didn't truncate the verity metadata at all (either from cache or from disk) if an error occurred while setting the verity bit. 
Fix these bugs by adding a call to truncate_inode_pages() and ensuring that we truncate the verity metadata (both from cache and from disk) in all error paths. Also rework the code to cleanly separate the success path from the error paths, which makes it much easier to understand. Finally, log a message if f2fs_truncate() fails, since it might otherwise fail silently. Reported-by: Yunlei He Fixes: 95ae251fe828 ("f2fs: add fs-verity support") Cc: # v5.4+ Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/verity.c | 75 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 95be8222e2eb..fb552a1eec81 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -152,40 +152,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size) { struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; struct fsverity_descriptor_location dloc = { .version = cpu_to_le32(F2FS_VERIFY_VER), .size = cpu_to_le32(desc_size), .pos = cpu_to_le64(desc_pos), }; - int err = 0; + int err = 0, err2 = 0; - if (desc != NULL) { - /* Succeeded; write the verity descriptor. */ - err = pagecache_write(inode, desc, desc_size, desc_pos); + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; - /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */ - if (!err) - err = filemap_write_and_wait(inode->i_mapping); - } + /* Append the verity descriptor. */ + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + goto cleanup; - /* If we failed, truncate anything we wrote past i_size. */ - if (desc == NULL || err) - f2fs_truncate(inode); + /* + * Write all pages (both data and verity metadata). Note that this must + * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond + * i_size won't be written properly. For crash consistency, this also + * must happen before the verity inode flag gets persisted. + */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; + + /* Set the verity xattr. */ + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), + NULL, XATTR_CREATE); + if (err) + goto cleanup; + + /* Finally, set the verity inode flag. */ + file_set_verity(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; - if (desc != NULL && !err) { - err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, - F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), - NULL, XATTR_CREATE); - if (!err) { - file_set_verity(inode); - f2fs_set_inode_flags(inode); - f2fs_mark_inode_dirty_sync(inode, true); - } +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk) and clearing FI_VERITY_IN_PROGRESS. + * + * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection + * from re-instantiating cached pages we are truncating (since unlike + * normal file accesses, garbage collection isn't limited by i_size). 
+ */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_inode_pages(inode->i_mapping, inode->i_size); + err2 = f2fs_truncate(inode); + if (err2) { + f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)", + err2); + set_sbi_flag(sbi, SBI_NEED_FSCK); } - return err; + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return err ?: err2; } static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, From 1c579e4924504d645f101b75059a798ecf4dd1ce Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 1 Mar 2021 17:28:16 -0800 Subject: [PATCH 206/580] f2fs: expose # of overprivision segments This is useful when checking conditions during checkpoint=disable in Android. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 5 +++++ fs/f2fs/sysfs.c | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 4ada9899ad84..054957872dfa 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -389,3 +389,8 @@ Description: Give a way to change checkpoint merge daemon's io priority. I/O priority "3". We can select the class between "rt" and "be", and set the I/O priority within valid range of it. "," delimiter is necessary in between I/O class and priority number. + +What: /sys/fs/f2fs//ovp_segments +Date: March 2021 +Contact: "Jaegeuk Kim" +Description: Shows the number of overprovision segments. diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index dc2b102f3482..2a55f28ca329 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -91,6 +91,13 @@ static ssize_t free_segments_show(struct f2fs_attr *a, (unsigned long long)(free_segments(sbi))); } +static ssize_t ovp_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)(overprovision_segments(sbi))); +} + static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -629,6 +636,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); +F2FS_GENERAL_RO_ATTR(ovp_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); @@ -715,6 +723,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), + ATTR_LIST(ovp_segments), ATTR_LIST(unusable), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), From 16486f24c3a060716a8b8bfc8b2a5be420961f62 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 5 Mar 2021 17:56:01 +0800 Subject: [PATCH 207/580] f2fs: fix to align to section for fallocate() on pinned file Now, fallocate() on a pinned file only allocates blocks which aligns to segment rather than section, so GC may try to migrate pinned file's block, and after several times of failure, pinned file's block could be migrated to other place, however user won't be aware of such condition, and then old obsolete block address may be readed/written incorrectly. To avoid such condition, let's try to allocate pinned file's blocks with section alignment. 
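A condensed sketch of the allocation-granularity change in expand_inode_data(); the names come from the hunks below and this is a simplification, not a drop-in replacement:

	/* pinned files now reserve whole sections per pass, not segments */
	block_t sec_blks = BLKS_PER_SEC(sbi);		/* blocks per section */
	block_t sec_len = roundup(map.m_len, sec_blks);	/* round request up */

	map.m_len = sec_blks;				/* one section per pass */
	/* each pass then calls
	 *	f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED);
	 * and loops until sec_len blocks have been mapped. */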
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 19 +++++++++---------- fs/f2fs/segment.c | 34 ++++++++++++++++++++++++++-------- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 65a6e02d581c..a633a916995d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3399,7 +3399,7 @@ void f2fs_get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9bb03379241b..bf2926cd9553 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1666,27 +1666,26 @@ static int expand_inode_data(struct inode *inode, loff_t offset, return 0; if (f2fs_is_pinned_file(inode)) { - block_t len = (map.m_len >> sbi->log_blocks_per_seg) << - sbi->log_blocks_per_seg; + block_t sec_blks = BLKS_PER_SEC(sbi); + block_t sec_len = roundup(map.m_len, sec_blks); block_t done = 0; - if (map.m_len % sbi->blocks_per_seg) - len += sbi->blocks_per_seg; - - map.m_len = sbi->blocks_per_seg; + map.m_len = sec_blks; next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + if (err && err != -ENODATA && err != -EAGAIN) { + map.m_len = done; goto out_err; + } } down_write(&sbi->pin_sem); f2fs_lock_op(sbi); - f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED); f2fs_unlock_op(sbi); map.m_seg_type = CURSEG_COLD_DATA_PINNED; @@ -1695,9 +1694,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset, up_write(&sbi->pin_sem); done += map.m_len; - len -= map.m_len; + sec_len -= map.m_len; map.m_lblk += map.m_len; - if (!err && len) + if (!err && sec_len) goto next_alloc; map.m_len = done; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cb4e25ab5109..1f2a2512b63b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2898,7 +2898,8 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, up_read(&SM_I(sbi)->curseg_lock); } -static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, + bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; @@ -2906,10 +2907,22 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) if (!curseg->inited) goto alloc; - if (!curseg->next_blkoff && - !get_valid_blocks(sbi, curseg->segno, false) && - !get_ckpt_valid_blocks(sbi, curseg->segno)) - return; + if (curseg->next_blkoff || + get_valid_blocks(sbi, curseg->segno, new_sec)) + goto alloc; + + if (new_sec) { + unsigned int segno = START_SEGNO(curseg->segno); + int i; + + for (i = 0; i < sbi->segs_per_sec; i++, segno++) { + if (get_ckpt_valid_blocks(sbi, segno)) + goto alloc; + } + } else { + if (!get_ckpt_valid_blocks(sbi, curseg->segno)) + return; + } alloc: old_segno = curseg->segno; @@ -2917,10 +2930,15 @@ static void __allocate_new_segment(struct 
f2fs_sb_info *sbi, int type) locate_dirty_segment(sbi, old_segno); } -void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_section(struct f2fs_sb_info *sbi, int type) +{ + __allocate_new_segment(sbi, type, true); +} + +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type) { down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_segment(sbi, type); + __allocate_new_section(sbi, type); up_write(&SIT_I(sbi)->sentry_lock); } @@ -2930,7 +2948,7 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segment(sbi, i); + __allocate_new_segment(sbi, i, false); up_write(&SIT_I(sbi)->sentry_lock); } From 3d380094044d9119c53815f84c296128b43a5b1e Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 9 Mar 2021 13:21:18 +0800 Subject: [PATCH 208/580] f2fs: fix to use per-inode maxbytes in f2fs_fiemap F2FS inode may have different max size, so change to use per-inode maxbytes. Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 62fe53ea75a3..777fb2121311 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1799,6 +1799,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int ret = 0; bool compr_cluster = false; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { ret = f2fs_precache_extents(inode); @@ -1812,6 +1813,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + if (start > maxbytes) { + ret = -EFBIG; + goto out; + } + + if (len > maxbytes || (maxbytes - len) < start) + len = maxbytes - start; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { ret = f2fs_xattr_fiemap(inode, fieinfo); goto out; From f059ea36d3a5e868da11011174421f427bf2009b Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 15 Mar 2021 17:12:33 +0900 Subject: [PATCH 209/580] f2fs: add sysfs nodes to get runtime compression stat I've added new sysfs nodes to show runtime compression stat since mount. compr_written_block - show the block count written after compression compr_saved_block - show the saved block count with compression compr_new_inode - show the count of inode newly enabled for compression Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 24 +++++++++++++++ fs/f2fs/compress.c | 1 + fs/f2fs/f2fs.h | 19 ++++++++++++ fs/f2fs/sysfs.c | 41 +++++++++++++++++++++++++ 4 files changed, 85 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 054957872dfa..e61cf7da8e40 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -394,3 +394,27 @@ What: /sys/fs/f2fs//ovp_segments Date: March 2021 Contact: "Jaegeuk Kim" Description: Shows the number of overprovision segments. + +What: /sys/fs/f2fs//compr_written_block +Date: March 2021 +Contact: "Daeho Jeong" +Description: Show the block count written after compression since mount. Note + that when the compressed blocks are deleted, this count doesn't + decrease. If you write "0" here, you can initialize + compr_written_block and compr_saved_block to "0". 
+ +What: /sys/fs/f2fs//compr_saved_block +Date: March 2021 +Contact: "Daeho Jeong" +Description: Show the saved block count with compression since mount. Note + that when the compressed blocks are deleted, this count doesn't + decrease. If you write "0" here, you can initialize + compr_written_block and compr_saved_block to "0". + +What: /sys/fs/f2fs//compr_new_inode +Date: March 2021 +Contact: "Daeho Jeong" +Description: Show the count of inode newly enabled for compression since mount. + Note that when the compression is disabled for the files, this count + doesn't decrease. If you write "0" here, you can initialize + compr_new_inode to "0". diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index f2918a62989c..3a38e0e2840b 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1324,6 +1324,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); + add_compr_block_stat(inode, cc->nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a633a916995d..55eb0adba4c3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1633,6 +1633,11 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ + + /* For runtime compression statistics */ + u64 compr_written_block; + u64 compr_saved_block; + u32 compr_new_inode; #endif }; @@ -3972,6 +3977,18 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); +#define inc_compr_inode_stat(inode) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + sbi->compr_new_inode++; \ + } while (0) +#define add_compr_block_stat(inode, blocks) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + int diff = F2FS_I(inode)->i_cluster_size - blocks; \ + sbi->compr_written_block += blocks; \ + sbi->compr_saved_block += diff; \ + } while (0) #else static inline bool f2fs_is_compressed_page(struct page *page) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -4000,6 +4017,7 @@ static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } +#define inc_compr_inode_stat(inode) do { } while (0) #endif static inline void set_compress_context(struct inode *inode) @@ -4023,6 +4041,7 @@ static inline void set_compress_context(struct inode *inode) F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); + inc_compr_inode_stat(inode); f2fs_mark_inode_dirty_sync(inode, true); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 2a55f28ca329..27488290d935 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -289,6 +290,20 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block")) + return snprintf(buf, PAGE_SIZE, "%llu\n", + sbi->compr_written_block); + + if 
(!strcmp(a->attr.name, "compr_saved_block")) + return snprintf(buf, PAGE_SIZE, "%llu\n", + sbi->compr_saved_block); + + if (!strcmp(a->attr.name, "compr_new_inode")) + return snprintf(buf, PAGE_SIZE, "%u\n", + sbi->compr_new_inode); +#endif + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -465,6 +480,24 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block") || + !strcmp(a->attr.name, "compr_saved_block")) { + if (t != 0) + return -EINVAL; + sbi->compr_written_block = 0; + sbi->compr_saved_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "compr_new_inode")) { + if (t != 0) + return -EINVAL; + sbi->compr_new_inode = 0; + return count; + } +#endif + *ui = (unsigned int)t; return count; @@ -676,6 +709,9 @@ F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); #endif #define ATTR_LIST(name) (&f2fs_attr_##name.attr) @@ -739,6 +775,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(moved_blocks_foreground), ATTR_LIST(moved_blocks_background), ATTR_LIST(avg_vblocks), +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + ATTR_LIST(compr_written_block), + ATTR_LIST(compr_saved_block), + ATTR_LIST(compr_new_inode), #endif NULL, }; From b24b1b27ef525cdfb520eafed132e18bbe8edc05 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Wed, 17 Mar 2021 17:27:23 +0800 Subject: [PATCH 210/580] f2fs: do not use AT_SSR mode in FG_GC & high urgent BG_GC AT_SSR mode is introduced by age threshold based GC for better hot/cold data seperation and avoiding free segment cost. However, LFS write mode is preferred in the scenario of foreground or high urgent GC, which should be finished ASAP. Let's only use AT_SSR in background GC and not high urgent GC modes. Signed-off-by: Weichao Guo Signed-off-by: Huang Jianan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 ++- fs/f2fs/segment.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 86ba8ed0b8a7..d96acc6531f2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1120,7 +1120,8 @@ static int move_data_block(struct inode *inode, block_t bidx, block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); - int type = fio.sbi->am.atgc_enabled ? + int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) && + (fio.sbi->gc_mode != GC_URGENT_HIGH) ? 
CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1f2a2512b63b..ffc78be13438 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3262,7 +3262,9 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) struct inode *inode = fio->page->mapping->host; if (is_cold_data(fio->page)) { - if (fio->sbi->am.atgc_enabled) + if (fio->sbi->am.atgc_enabled && + (fio->io_type == FS_DATA_IO) && + (fio->sbi->gc_mode != GC_URGENT_HIGH)) return CURSEG_ALL_DATA_ATGC; else return CURSEG_COLD_DATA; From 11b35098d6328ec8f94433765e0acb185c4a31ff Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Mar 2021 17:56:03 +0800 Subject: [PATCH 211/580] f2fs: don't start checkpoint thread in readonly mountpoint In readonly mountpoint, there should be no write IOs include checkpoint IO, so that it's not needed to create kernel checkpoint thread. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a2e9130d63f2..e2f001314800 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2056,8 +2056,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } - if (!test_opt(sbi, DISABLE_CHECKPOINT) && - test_opt(sbi, MERGE_CHECKPOINT)) { + if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + !test_opt(sbi, MERGE_CHECKPOINT)) { + f2fs_stop_ckpt_thread(sbi); + } else { err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, @@ -2065,8 +2067,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err); goto restore_gc; } - } else { - f2fs_stop_ckpt_thread(sbi); } /* @@ -3795,7 +3795,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* setup checkpoint request control and start checkpoint issue thread */ f2fs_init_ckpt_req_control(sbi); - if (!test_opt(sbi, DISABLE_CHECKPOINT) && + if (!f2fs_readonly(sb) && !test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, MERGE_CHECKPOINT)) { err = f2fs_start_ckpt_thread(sbi); if (err) { From b9ded1bb5b105a78646e43203c0f63259792e53e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Mar 2021 19:47:30 +0800 Subject: [PATCH 212/580] f2fs: fix to avoid out-of-bounds memory access butt3rflyh4ck reported a bug found by syzkaller fuzzer with custom modifications in 5.12.0-rc3+ [1]: dump_stack+0xfa/0x151 lib/dump_stack.c:120 print_address_description.constprop.0.cold+0x82/0x32c mm/kasan/report.c:232 __kasan_report mm/kasan/report.c:399 [inline] kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:416 f2fs_test_bit fs/f2fs/f2fs.h:2572 [inline] current_nat_addr fs/f2fs/node.h:213 [inline] get_next_nat_page fs/f2fs/node.c:123 [inline] __flush_nat_entry_set fs/f2fs/node.c:2888 [inline] f2fs_flush_nat_entries+0x258e/0x2960 fs/f2fs/node.c:2991 f2fs_write_checkpoint+0x1372/0x6a70 fs/f2fs/checkpoint.c:1640 f2fs_issue_checkpoint+0x149/0x410 fs/f2fs/checkpoint.c:1807 f2fs_sync_fs+0x20f/0x420 fs/f2fs/super.c:1454 __sync_filesystem fs/sync.c:39 [inline] sync_filesystem fs/sync.c:67 [inline] sync_filesystem+0x1b5/0x260 fs/sync.c:48 generic_shutdown_super+0x70/0x370 fs/super.c:448 kill_block_super+0x97/0xf0 fs/super.c:1394 The root cause is, if nat entry in checkpoint journal area is corrupted, e.g. 
nid of journalled nat entry exceeds max nid value, during checkpoint, once it tries to flush nat journal to NAT area, get_next_nat_page() may access out-of-bounds memory on nat_bitmap due to it uses wrong nid value as bitmap offset. [1] https://lore.kernel.org/lkml/CAFcO6XOMWdr8pObek6eN6-fs58KG9doRFadgJj-FnF-1x43s2g@mail.gmail.com/T/#u Reported-and-tested-by: butt3rflyh4ck Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0137878cb394..e617922a3c72 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2785,6 +2785,9 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) struct f2fs_nat_entry raw_ne; nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); + if (f2fs_check_nid_range(sbi, nid)) + continue; + raw_ne = nat_in_journal(journal, i); ne = __lookup_nat_cache(nm_i, nid); From f2b4738613482212a5cc97c250e1542345b97412 Mon Sep 17 00:00:00 2001 From: qiulaibin Date: Tue, 23 Mar 2021 19:41:30 +0800 Subject: [PATCH 213/580] f2fs: fix wrong comment of nat_tree_lock Do trivial comment fix of nat_tree_lock. Signed-off-by: qiulaibin Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 55eb0adba4c3..70edc4bcb894 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -865,7 +865,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct rw_semaphore nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ From c5b622a7c1d7827a8409743f0106f7fc7a0202f6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Mar 2021 17:56:04 +0800 Subject: [PATCH 214/580] f2fs: fix error path of f2fs_remount() In error path of f2fs_remount(), it missed to restart/stop kernel thread or enable/disable checkpoint, then mount option status may not be consistent with real condition of filesystem, so let's reorder remount flow a bit as below and do recovery correctly in error path: 1) handle gc thread 2) handle ckpt thread 3) handle flush thread 4) handle checkpoint disabling Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e2f001314800..7a81e8d77350 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1914,8 +1914,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; int err; - bool need_restart_gc = false; - bool need_stop_gc = false; + bool need_restart_gc = false, need_stop_gc = false; + bool need_restart_ckpt = false, need_stop_ckpt = false; + bool need_restart_flush = false, need_stop_flush = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); @@ -2046,19 +2047,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_IS_CLOSE); } - if (checkpoint_changed) { - if (test_opt(sbi, 
DISABLE_CHECKPOINT)) { - err = f2fs_disable_checkpoint(sbi); - if (err) - goto restore_gc; - } else { - f2fs_enable_checkpoint(sbi); - } - } - if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || !test_opt(sbi, MERGE_CHECKPOINT)) { f2fs_stop_ckpt_thread(sbi); + need_restart_ckpt = true; } else { err = f2fs_start_ckpt_thread(sbi); if (err) { @@ -2067,6 +2059,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err); goto restore_gc; } + need_stop_ckpt = true; } /* @@ -2076,11 +2069,24 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); + need_restart_flush = true; } else { err = f2fs_create_flush_cmd_control(sbi); if (err) - goto restore_gc; + goto restore_ckpt; + need_stop_flush = true; } + + if (checkpoint_changed) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto restore_flush; + } else { + f2fs_enable_checkpoint(sbi); + } + } + skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ @@ -2095,6 +2101,21 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_flush: + if (need_restart_flush) { + if (f2fs_create_flush_cmd_control(sbi)) + f2fs_warn(sbi, "background flush thread has stopped"); + } else if (need_stop_flush) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + } +restore_ckpt: + if (need_restart_ckpt) { + if (f2fs_start_ckpt_thread(sbi)) + f2fs_warn(sbi, "background ckpt thread has stopped"); + } else if (need_stop_ckpt) { + f2fs_stop_ckpt_thread(sbi); + } restore_gc: if (need_restart_gc) { if (f2fs_start_gc_thread(sbi)) From 2b724d3a616827752d3a95e2a72444e39c8b21f7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 24 Mar 2021 11:24:33 +0800 Subject: [PATCH 215/580] f2fs: fix to update last i_size if fallocate partially succeeds In the case of expanding pinned file, map.m_lblk and map.m_len will update in each round of section allocation, so in error path, last i_size will be calculated with wrong m_lblk and m_len, fix it. 
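As an aside, a minimal sketch of the bookkeeping pattern the fix moves to: accumulate the total allocated length in its own counter so the error path can derive the last allocated offset from the original start block, rather than from map fields that every allocation round overwrites. This is only an illustration with made-up names, not the actual f2fs code:

        /* Sketch: allocate `len` blocks in fixed-size chunks and remember how far we got. */
        static long expand_in_chunks(unsigned long start, unsigned long len,
                                     unsigned long chunk)
        {
                unsigned long pos = start;      /* advances each round, like map.m_lblk */
                unsigned long expanded = 0;     /* total blocks actually allocated */
                int err = 0;

                while (len) {
                        unsigned long step = min(len, chunk);

                        err = allocate_blocks(pos, step);       /* assumed helper */
                        if (err)
                                break;
                        expanded += step;
                        pos += step;
                        len -= step;
                }

                if (!expanded)
                        return err;     /* nothing was allocated, just report the error */

                /* Last allocated block is start + expanded - 1, independent of
                 * the loop variables that have already moved on. */
                return start + expanded - 1;
        }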
Fixes: f5a53edcf01e ("f2fs: support aligned pinned file") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bf2926cd9553..8d4dc5bfad7d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1639,9 +1639,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; - pgoff_t pg_end; + pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; + block_t expanded = 0; int err; err = inode_newsize_ok(inode, (len + offset)); @@ -1654,11 +1655,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_balance_fs(sbi, true); + pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; off_end = (offset + len) & (PAGE_SIZE - 1); - map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; - map.m_len = pg_end - map.m_lblk; + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; if (off_end) map.m_len++; @@ -1668,7 +1670,6 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (f2fs_is_pinned_file(inode)) { block_t sec_blks = BLKS_PER_SEC(sbi); block_t sec_len = roundup(map.m_len, sec_blks); - block_t done = 0; map.m_len = sec_blks; next_alloc: @@ -1676,10 +1677,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) { - map.m_len = done; + if (err && err != -ENODATA && err != -EAGAIN) goto out_err; - } } down_write(&sbi->pin_sem); @@ -1693,24 +1692,25 @@ static int expand_inode_data(struct inode *inode, loff_t offset, up_write(&sbi->pin_sem); - done += map.m_len; + expanded += map.m_len; sec_len -= map.m_len; map.m_lblk += map.m_len; if (!err && sec_len) goto next_alloc; - map.m_len = done; + map.m_len = expanded; } else { err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + expanded = map.m_len; } out_err: if (err) { pgoff_t last_off; - if (!map.m_len) + if (!expanded) return err; - last_off = map.m_lblk + map.m_len - 1; + last_off = pg_start + expanded - 1; /* update new size to the failed position */ new_size = (last_off == pg_end) ? offset + len : From dbd7afe9d57882da663d6a9f7fcb75272b873ada Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 24 Mar 2021 11:18:28 +0800 Subject: [PATCH 216/580] f2fs: fix to avoid touching checkpointed data in get_victim() In CP disabling mode, there are two issues when using LFS or SSR | AT_SSR mode to select victim: 1. LFS is set to find source section during GC, the victim should have no checkpointed data, since after GC, section could not be set free for reuse. Previously, we only check valid chpt blocks in current segment rather than section, fix it. 2. 
SSR | AT_SSR are set to find target segment for writes which can be fully filled by checkpointed and newly written blocks, we should never select such segment, otherwise it can cause panic or data corruption during allocation, potential case is described as below: a) target segment has 128 ckpt valid blocks b) GC migrates 'n' (n < 512) valid blocks to other segment (segment is still in dirty list) c) GC migrates '512 - n' blocks to target segment (segment has 'n' cp_vblocks and '512 - n' vblocks) d) If GC selects target segment via {AT,}SSR allocator, however there is no free space in targe segment. Fixes: 4354994f097d ("f2fs: checkpoint disabling") Fixes: 093749e296e2 ("f2fs: support age threshold based garbage collection") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/gc.c | 28 ++++++++++++++++++++-------- fs/f2fs/segment.c | 36 +++++++++++++++++++++--------------- fs/f2fs/segment.h | 14 +++++++++++++- 4 files changed, 55 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 70edc4bcb894..236cf72204e4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3397,6 +3397,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d96acc6531f2..a2ca483f9855 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -392,10 +392,6 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, if (p->gc_mode == GC_AT && get_valid_blocks(sbi, segno, true) == 0) return; - - if (p->alloc_mode == AT_SSR && - get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0) - return; } for (i = 0; i < sbi->segs_per_sec; i++) @@ -728,11 +724,27 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (sec_usage_check(sbi, secno)) goto next; + /* Don't touch checkpointed data */ - if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && - get_ckpt_valid_blocks(sbi, segno) && - p.alloc_mode == LFS)) - goto next; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p.alloc_mode == LFS) { + /* + * LFS is set to find source section during GC. + * The victim should have no checkpointed data. + */ + if (get_ckpt_valid_blocks(sbi, segno, true)) + goto next; + } else { + /* + * SSR | AT_SSR are set to find target segment + * for writes which can be full by checkpointed + * and newly written blocks. 
+ */ + if (!f2fs_segment_has_free_slot(sbi, segno)) + goto next; + } + } + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ffc78be13438..c063529e6ac4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -865,7 +865,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); - ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); + ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || ckpt_valid_blocks == usable_blocks)) { @@ -950,7 +950,7 @@ static unsigned int get_free_segment(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; - if (get_ckpt_valid_blocks(sbi, segno)) + if (get_ckpt_valid_blocks(sbi, segno, false)) continue; mutex_unlock(&dirty_i->seglist_lock); return segno; @@ -2641,6 +2641,23 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, seg->next_blkoff++; } +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long *target_map = SIT_I(sbi)->tmp_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, 0); + + return pos < sbi->blocks_per_seg; +} + /* * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks @@ -2911,19 +2928,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, get_valid_blocks(sbi, curseg->segno, new_sec)) goto alloc; - if (new_sec) { - unsigned int segno = START_SEGNO(curseg->segno); - int i; - - for (i = 0; i < sbi->segs_per_sec; i++, segno++) { - if (get_ckpt_valid_blocks(sbi, segno)) - goto alloc; - } - } else { - if (!get_ckpt_valid_blocks(sbi, curseg->segno)) - return; - } - + if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) + return; alloc: old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 144980b62f9e..dab87ecba2b5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -359,8 +359,20 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, } static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool use_section) { + if (use_section && __is_large_section(sbi)) { + unsigned int start_segno = START_SEGNO(segno); + unsigned int blocks = 0; + int i; + + for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); + + blocks += se->ckpt_valid_blocks; + } + return blocks; + } return get_seg_entry(sbi, segno)->ckpt_valid_blocks; } From e5fc650e0dd4470b71a954adc56af66badbe539a Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 16 Mar 2021 14:59:18 +0530 Subject: [PATCH 217/580] f2fs: allow to change discard policy based on cached discard cmds With the default DPOLICY_BG discard thread is ioaware, which prevents the discard thread from issuing the discard 
commands. On low RAM setups, it is observed that these discard commands in the cache are consuming high memory. This patch aims to relax the memory pressure on the system due to f2fs pending discard cmds by changing the policy to DPOLICY_FORCE based on the nm_i->ram_thresh configured. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 ++++++++ fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 3 ++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e617922a3c72..a571079cc4b0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -43,11 +43,15 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct sysinfo val; unsigned long avail_ram; unsigned long mem_size = 0; bool res = false; + if (!nm_i) + return true; + si_meminfo(&val); /* only uses low memory */ @@ -89,6 +93,10 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) /* it allows 20% / total_ram for inmemory pages */ mem_size = get_pages(sbi, F2FS_INMEM_PAGES); res = mem_size < (val.totalram / 5); + } else if (type == DISCARD_CACHE) { + mem_size = (atomic_read(&dcc->discard_cmd_cnt) * + sizeof(struct discard_cmd)) >> PAGE_SHIFT; + res = mem_size < (avail_ram * nm_i->ram_thresh / 100); } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index f84541b57acb..7a45c0f10629 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,6 +147,7 @@ enum mem_type { INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ INMEM_PAGES, /* indicates inmemory pages */ + DISCARD_CACHE, /* indicates memory of cached discard cmds */ BASE_CHECK, /* check kernel status */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c063529e6ac4..0479360a5d4c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1762,7 +1762,8 @@ static int issue_discard_thread(void *data) if (!atomic_read(&dcc->discard_cmd_cnt)) continue; - if (sbi->gc_mode == GC_URGENT_HIGH) + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); From 29c0c16eaf0382c60d1f330d160bd0950c18bffe Mon Sep 17 00:00:00 2001 From: Ruiqi Gong Date: Thu, 25 Mar 2021 02:38:11 -0400 Subject: [PATCH 218/580] f2fs: fix a typo in inode.c Do a trivial typo fix. s/runing/running Reported-by: Hulk Robot Signed-off-by: Ruiqi Gong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a67705694687..20319c84b70e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -698,7 +698,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) /* * We need to balance fs here to prevent from producing dirty node pages - * during the urgent cleaning time when runing out of free sections. + * during the urgent cleaning time when running out of free sections. */ f2fs_update_inode_page(inode); if (wbc && wbc->nr_to_write) From 66be0dbe265459c473a9f3f4f786351e5631f90b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 26 Mar 2021 22:41:43 +0800 Subject: [PATCH 219/580] f2fs: delete empty compress.h Commit 75e91c888989 ("f2fs: compress: fix compression chksum") wrongly introduced empty compress.h, delete it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 fs/f2fs/compress.h diff --git a/fs/f2fs/compress.h b/fs/f2fs/compress.h deleted file mode 100644 index e69de29bb2d1..000000000000 From 9b18ced41763f64875edb69ac84d5848d00735f7 Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Thu, 25 Mar 2021 10:19:20 -0400 Subject: [PATCH 220/580] f2fs: fix wrong alloc_type in f2fs_do_replace_block If the alloc_type of the original curseg is LFS, when we change_curseg and then do recover curseg, the alloc_type becomes SSR. Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0479360a5d4c..ada0990bddb7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3571,6 +3571,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct seg_entry *se; int type; unsigned short old_blkoff; + unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); @@ -3604,6 +3605,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; + old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { @@ -3638,6 +3640,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, change_curseg(sbi, type, true); } curseg->next_blkoff = old_blkoff; + curseg->alloc_type = old_alloc_type; } up_write(&sit_i->sentry_lock); From 58ec18d30a29ebb0c94433b12c2f0dafc3b1ae46 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 26 Mar 2021 09:46:22 +0800 Subject: [PATCH 221/580] f2fs: fix to cover __allocate_new_section() with curseg_lock In order to avoid race with f2fs_do_replace_block(). Fixes: f5a53edcf01e ("f2fs: support aligned pinned file") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ada0990bddb7..cf5c8d538ff9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2944,19 +2944,23 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi, int type) void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type) { + down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); __allocate_new_section(sbi, type); up_write(&SIT_I(sbi)->sentry_lock); + up_read(&SM_I(sbi)->curseg_lock); } void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segment(sbi, i, false); up_write(&SIT_I(sbi)->sentry_lock); + up_read(&SM_I(sbi)->curseg_lock); } static const struct segment_allocation default_salloc_ops = { From d0ad620e587f476ceebf8d8f31013c3e27c2201c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 27 Mar 2021 17:57:06 +0800 Subject: [PATCH 222/580] f2fs: introduce gc_merge mount option In this patch, we will add two new mount options: "gc_merge" and "nogc_merge", when background_gc is on, "gc_merge" option can be set to let background GC thread to handle foreground GC requests, it can eliminate the sluggish issue caused by slow foreground GC operation when GC is triggered from a process with limited I/O and CPU resources. Original idea is from Xiang. 
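As an aside, the hand-off this option enables follows a standard wait-queue pattern: the task that would otherwise run foreground GC parks itself on a dedicated wait queue, kicks the GC thread, and sleeps until that thread finishes a pass and wakes every parked waiter. A minimal sketch of that pattern with hypothetical names (the real hook points are in the f2fs_balance_fs() and gc_thread_func() hunks below):

        /* Caller side: hand the work to the worker thread and wait for completion. */
        static void request_and_wait(struct worker *w)
        {
                DEFINE_WAIT(wait);

                prepare_to_wait(&w->done_wq, &wait, TASK_UNINTERRUPTIBLE);
                wake_up(&w->kick_wq);           /* nudge the worker */
                io_schedule();                  /* sleep until the worker wakes us */
                finish_wait(&w->done_wq, &wait);
        }

        /* Worker side, inside its main loop. */
        if (waitqueue_active(&w->done_wq)) {
                do_one_pass(w);                 /* assumed helper doing the actual work */
                wake_up_all(&w->done_wq);       /* release every parked caller */
        }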
Signed-off-by: Gao Xiang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/gc.c | 26 ++++++++++++++++++++++---- fs/f2fs/gc.h | 6 ++++++ fs/f2fs/segment.c | 15 +++++++++++++-- fs/f2fs/super.c | 19 +++++++++++++++++-- 6 files changed, 65 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 35ed01a5fbc9..63c0c49b726d 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -110,6 +110,12 @@ background_gc=%s Turn on/off cleaning operations, namely garbage on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. +gc_merge When background_gc is on, this option can be enabled to + let background GC thread to handle foreground GC requests, + it can eliminate the sluggish issue caused by slow foreground + GC operation when GC is triggered from a process with limited + I/O and CPU resources. +nogc_merge Disable GC merge feature. disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 236cf72204e4..8bb0ecba09b0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -96,6 +96,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 +#define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a2ca483f9855..5c48825fd12d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -31,19 +31,24 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; set_freezable(); do { - bool sync_mode; + bool sync_mode, foreground = false; wait_event_interruptible_timeout(*wq, kthread_should_stop() || freezing(current) || + waitqueue_active(fggc_wq) || gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) + foreground = true; + /* give it a try one time */ if (gc_th->gc_wake) gc_th->gc_wake = 0; @@ -90,7 +95,10 @@ static int gc_thread_func(void *data) goto do_gc; } - if (!down_write_trylock(&sbi->gc_lock)) { + if (foreground) { + down_write(&sbi->gc_lock); + goto do_gc; + } else if (!down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } @@ -107,14 +115,22 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - stat_inc_bggc_count(sbi->stat_info); + if (!foreground) + stat_inc_bggc_count(sbi->stat_info); sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; + /* foreground GC was been triggered via f2fs_balance_fs() */ + if (foreground) + sync_mode = false; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, true, false, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; + if (foreground) + wake_up_all(&gc_th->fggc_wq); + trace_f2fs_background_gc(sbi->sb, wait_ms, prefree_segments(sbi), free_segments(sbi)); 
@@ -148,6 +164,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + init_waitqueue_head(&sbi->gc_thread->fggc_wq); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { @@ -165,6 +182,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); + wake_up_all(&gc_th->fggc_wq); kfree(gc_th); sbi->gc_thread = NULL; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 0c8dae12dc51..3fe145e8e594 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -42,6 +42,12 @@ struct f2fs_gc_kthread { /* for changing gc mode */ unsigned int gc_wake; + + /* for GC_MERGE mount option */ + wait_queue_head_t fggc_wq; /* + * caller of f2fs_balance_fs() + * will wait on this wait queue. + */ }; struct gc_inode_list { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cf5c8d538ff9..b076823e7c61 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -503,8 +503,19 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * dir/node pages without enough free segments. */ if (has_not_enough_free_secs(sbi, 0, 0)) { - down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + down_write(&sbi->gc_lock); + f2fs_gc(sbi, false, false, false, NULL_SEGNO); + } } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7a81e8d77350..9a41ac8a33a9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -149,6 +149,8 @@ enum { Opt_compress_chksum, Opt_compress_mode, Opt_atgc, + Opt_gc_merge, + Opt_nogc_merge, Opt_err, }; @@ -220,6 +222,8 @@ static match_table_t f2fs_tokens = { {Opt_compress_chksum, "compress_chksum"}, {Opt_compress_mode, "compress_mode=%s"}, {Opt_atgc, "atgc"}, + {Opt_gc_merge, "gc_merge"}, + {Opt_nogc_merge, "nogc_merge"}, {Opt_err, NULL}, }; @@ -1055,6 +1059,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_atgc: set_opt(sbi, ATGC); break; + case Opt_gc_merge: + set_opt(sbi, GC_MERGE); + break; + case Opt_nogc_merge: + clear_opt(sbi, GC_MERGE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1667,6 +1677,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); + if (test_opt(sbi, GC_MERGE)) + seq_puts(seq, ",gc_merge"); + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, NORECOVERY)) @@ -2025,7 +2038,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * option. Also sync the filesystem. */ if ((*flags & SB_RDONLY) || - F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) { + (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && + !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { f2fs_stop_gc_thread(sbi); need_restart_gc = true; @@ -3961,7 +3975,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * If filesystem is not mounted as read-only then * do start the gc_thread. 
*/ - if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) { + if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF || + test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) From 512076fb01fa89582311ab94fdf430e838bbdfca Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 31 Mar 2021 11:16:32 +0800 Subject: [PATCH 223/580] f2fs: fix to restrict mount condition on readonly block device When we mount an unclean f2fs image in a readonly block device, let's make mount() succeed only when there is no recoverable data in that image, otherwise after mount(), file fsyned won't be recovered as user expected. Fixes: 938a184265d7 ("f2fs: give a warning only for readonly partition") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9a41ac8a33a9..6b59db7109f8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3926,10 +3926,18 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * previous checkpoint was not done by clean system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) - f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - else - f2fs_info(sbi, "write access unavailable, skipping recovery"); + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + err = f2fs_recover_fsync_data(sbi, true); + if (err > 0) { + err = -EROFS; + f2fs_err(sbi, "Need to recover fsync data, but " + "write access unavailable, please try " + "mount w/ disable_roll_forward or norecovery"); + } + if (err < 0) + goto free_meta; + } + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } From b7ffb0d7d370785619e50acefa1b7cbafea491c3 Mon Sep 17 00:00:00 2001 From: Yi Zhuang Date: Wed, 31 Mar 2021 17:34:14 +0800 Subject: [PATCH 224/580] f2fs: Fix a hungtask problem in atomic write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the cache writing process, if it is an atomic file, increase the page count of F2FS_WB_CP_DATA, otherwise increase the page count of F2FS_WB_DATA. When you step into the hook branch due to insufficient memory in f2fs_write_begin, f2fs_drop_inmem_pages_all will be called to traverse all atomic inodes and clear the FI_ATOMIC_FILE mark of all atomic files. In f2fs_drop_inmem_pages,first acquire the inmem_lock , revoke all the inmem_pages, and then clear the FI_ATOMIC_FILE mark. Before this mark is cleared, other threads may hold inmem_lock to add inmem_pages to the inode that has just been emptied inmem_pages, and increase the page count of F2FS_WB_CP_DATA. When the IO returns, it is found that the FI_ATOMIC_FILE flag is cleared by f2fs_drop_inmem_pages_all, and f2fs_is_atomic_file returns false,which causes the page count of F2FS_WB_DATA to be decremented. The page count of F2FS_WB_CP_DATA cannot be cleared. Finally, hungtask is triggered in f2fs_wait_on_all_pages because get_pages will never return zero. 
process A: process B: f2fs_drop_inmem_pages_all ->f2fs_drop_inmem_pages of inode#1 ->mutex_lock(&fi->inmem_lock) ->__revoke_inmem_pages of inode#1 f2fs_ioc_commit_atomic_write ->mutex_unlock(&fi->inmem_lock) ->f2fs_commit_inmem_pages of inode#1 ->mutex_lock(&fi->inmem_lock) ->__f2fs_commit_inmem_pages ->f2fs_do_write_data_page ->f2fs_outplace_write_data ->do_write_page ->f2fs_submit_page_write ->inc_page_count(sbi, F2FS_WB_CP_DATA ) ->mutex_unlock(&fi->inmem_lock) ->spin_lock(&sbi->inode_lock[ATOMIC_FILE]); ->clear_inode_flag(inode, FI_ATOMIC_FILE) ->spin_unlock(&sbi->inode_lock[ATOMIC_FILE]) f2fs_write_end_io ->dec_page_count(sbi, F2FS_WB_DATA ); We can fix the problem by putting the action of clearing the FI_ATOMIC_FILE mark into the inmem_lock lock. This operation can ensure that no one will submit the inmem pages before the FI_ATOMIC_FILE mark is cleared, so that there will be no atomic writes waiting for writeback. Fixes: 57864ae5ce3a ("f2fs: limit # of inmemory pages") Signed-off-by: Yi Zhuang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b076823e7c61..87ae92ead992 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -324,23 +324,27 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - while (!list_empty(&fi->inmem_pages)) { + do { mutex_lock(&fi->inmem_lock); + if (list_empty(&fi->inmem_pages)) { + fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + sbi->atomic_files--; + } + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + mutex_unlock(&fi->inmem_lock); + break; + } __revoke_inmem_pages(inode, &fi->inmem_pages, true, false, true); mutex_unlock(&fi->inmem_lock); - } - - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } while (1); } void f2fs_drop_inmem_page(struct inode *inode, struct page *page) From 6409732ab1c0783f07e52dc80f590219035bac05 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Apr 2021 17:25:20 -0700 Subject: [PATCH 225/580] f2fs: set checkpoint_merge by default Once we introduced checkpoint_merge, we've seen some contention w/o the option. In order to avoid it, let's set it by default. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6b59db7109f8..4c4586ea2176 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1826,6 +1826,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); + set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); From 816d1e3d82e83749a05ce3e122f733c119e169ae Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 1 Apr 2021 11:01:53 +0800 Subject: [PATCH 226/580] f2fs: fix to avoid GC/mmap race with f2fs_truncate() It missed to hold i_gc_rwsem and i_map_sem around f2fs_truncate() in f2fs_file_write_iter() to avoid racing with background GC and mmap, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8d4dc5bfad7d..b09addca7f73 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4464,8 +4464,13 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) clear_inode_flag(inode, FI_NO_PREALLOC); /* if we couldn't write data, we should deallocate blocks. */ - if (preallocated && i_size_read(inode) < target_size) + if (preallocated && i_size_read(inode) < target_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); From eb1997a774569f3f31164f27c5fd732791345db9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 2 Apr 2021 17:22:23 +0800 Subject: [PATCH 227/580] f2fs: fix to avoid accessing invalid fio in f2fs_allocate_data_block() Callers may pass fio parameter with NULL value to f2fs_allocate_data_block(), so we should make sure accessing fio's field after fio's validation check. Fixes: f608c38c59c6 ("f2fs: clean up parameter of f2fs_allocate_data_block()") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 87ae92ead992..9c9175fb8b85 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3416,12 +3416,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, f2fs_inode_chksum_set(sbi, page); } - if (F2FS_IO_ALIGNED(sbi)) - fio->retry = false; - if (fio) { struct f2fs_bio_info *io; + if (F2FS_IO_ALIGNED(sbi)) + fio->retry = false; + INIT_LIST_HEAD(&fio->list); fio->in_list = true; io = sbi->write_io[fio->type] + fio->temp; From 4bb2c2571f9cf8bc0090786a951de507771e7bc5 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 6 Apr 2021 14:39:16 +0530 Subject: [PATCH 228/580] f2fs: fix the periodic wakeups of discard thread Fix the unnecessary periodic wakeups of discard thread that happens under below two conditions - 1. When f2fs is heavily utilized over 80%, the current discard policy sets the max sleep timeout of discard thread as 50ms (DEF_MIN_DISCARD_ISSUE_TIME). But this is set even when there are no pending discard commands to be issued. 2. In the issue_discard_thread() path when there are no pending discard commands, it fails to reset the wait_ms to max timeout value. 
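As an aside, the shape of the fix is to pick the sleep interval from whether anything is actually queued before going to sleep, so an idle command list always gets the longest timeout. A hypothetical worker-loop sketch (names are made up, not the f2fs code):

        static int lazy_worker(void *data)
        {
                struct work_ctx *ctx = data;            /* assumed context type */

                while (!kthread_should_stop()) {
                        /* Long sleep when idle, short sleep only while work is pending. */
                        unsigned int wait_ms = pending_count(ctx) ?
                                        MIN_INTERVAL_MS : MAX_INTERVAL_MS;

                        wait_event_interruptible_timeout(ctx->wq,
                                        kthread_should_stop() || pending_count(ctx),
                                        msecs_to_jiffies(wait_ms));

                        if (pending_count(ctx))
                                issue_pending(ctx);     /* assumed helper */
                }
                return 0;
        }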
Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9c9175fb8b85..5fea37195a8a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1114,6 +1114,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int discard_type, unsigned int granularity) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; @@ -1133,7 +1135,9 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; - dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + if (atomic_read(&dcc->discard_cmd_cnt)) + dpolicy->max_interval = + DEF_MIN_DISCARD_ISSUE_TIME; } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1749,8 +1753,15 @@ static int issue_discard_thread(void *data) set_freezable(); do { - __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, - dcc->discard_granularity); + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + else + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, + dcc->discard_granularity); + + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || @@ -1777,10 +1788,6 @@ static int issue_discard_thread(void *data) if (!atomic_read(&dcc->discard_cmd_cnt)) continue; - if (sbi->gc_mode == GC_URGENT_HIGH || - !f2fs_available_free_memory(sbi, DISCARD_CACHE)) - __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); - sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); From 86c2fb4194076d5d0bab3ec85b7cff9e75094ad1 Mon Sep 17 00:00:00 2001 From: Yi Zhuang Date: Tue, 6 Apr 2021 09:47:35 +0800 Subject: [PATCH 229/580] f2fs: clean up build warnings This patch combined the below three clean-up patches. 
- modify open brace '{' following function definitions - ERROR: spaces required around that ':' - ERROR: spaces required before the open parenthesis '(' - ERROR: spaces prohibited before that ',' - Made suggested modifications from checkpatch in reference to WARNING: Missing a blank line after declarations Signed-off-by: Yi Zhuang Signed-off-by: Jia Yang Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 1 + fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 2 ++ fs/f2fs/debug.c | 3 +++ fs/f2fs/dir.c | 1 + fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 6 +++++- fs/f2fs/inode.c | 1 + fs/f2fs/namei.c | 3 +++ fs/f2fs/node.c | 8 +++++--- fs/f2fs/recovery.c | 3 ++- fs/f2fs/segment.c | 11 ++++++++++- fs/f2fs/super.c | 5 +++-- fs/f2fs/xattr.c | 1 + 14 files changed, 39 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 732ec10e7890..b21b98f924e1 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -29,6 +29,7 @@ static inline size_t f2fs_acl_size(int count) static inline int f2fs_acl_count(size_t size) { ssize_t s; + size -= sizeof(struct f2fs_acl_header); s = size - 4 * sizeof(struct f2fs_acl_entry_short); if (s < 0) { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 31f951e31dca..134f8ffb6ee3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -719,6 +719,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); + err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 777fb2121311..f40313d77322 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1048,6 +1048,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -3755,6 +3756,7 @@ int f2fs_migrate_page(struct address_space *mapping, if (atomic_written) { struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) if (cur->page == page) { cur->page = newpage; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 91855d5721cd..c03949a7ccff 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -173,6 +173,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->util_invalid = 50 - si->util_free - si->util_valid; for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + si->curseg[i] = curseg->segno; si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); @@ -300,10 +301,12 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->page_mem = 0; if (sbi->node_inode) { unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5f067e7e35e7..e412b26bd86b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -473,6 +473,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { enum page_type type = f2fs_has_inline_dentry(dir) ? 
NODE : DATA; + lock_page(page); f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b09addca7f73..e088dea455af 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2606,7 +2606,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_extent = NULL, - .m_seg_type = NO_CHECK_TYPE , + .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; struct extent_info ei = {0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5c48825fd12d..8d1f17ab94d8 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -160,7 +160,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_wake= 0; + gc_th->gc_wake = 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -179,6 +179,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); @@ -858,6 +859,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) static void put_gc_inode(struct gc_inode_list *gc_list) { struct inode_entry *ie, *next_ie; + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); @@ -982,9 +984,11 @@ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) bidx = node_ofs - 1; } else if (node_ofs <= indirect_blks) { int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; } else { int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; } return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 20319c84b70e..71a354b9fe19 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -666,6 +666,7 @@ void f2fs_update_inode_page(struct inode *inode) node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); + if (err == -ENOMEM) { cond_resched(); goto retry; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index db5d6c666ea2..fc6ebf22239b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -414,6 +414,7 @@ struct dentry *f2fs_get_parent(struct dentry *child) struct qstr dotdot = QSTR_INIT("..", 2); struct page *page; unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { if (IS_ERR(page)) return ERR_CAST(page); @@ -623,6 +624,7 @@ static const char *f2fs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *link = page_get_link(dentry, inode, done); + if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ do_delayed_call(done); @@ -760,6 +762,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); + if (f2fs_empty_dir(inode)) return f2fs_unlink(dir, dentry); return -ENOTEMPTY; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a571079cc4b0..a06eace2a4c8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -470,6 +470,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* increment version no as node is removed */ if (nat_get_blkaddr(e) != 
NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); + nat_set_version(e, inc_node_version(version)); } @@ -1391,7 +1392,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, goto out_err; } page_hit: - if(unlikely(nid != nid_of_node(page))) { + if (unlikely(nid != nid_of_node(page))) { f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), @@ -1783,7 +1784,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, out: if (nwritten) f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); - return ret ? -EIO: 0; + return ret ? -EIO : 0; } static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) @@ -2125,8 +2126,8 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i) { struct f2fs_nm_info *nm_i = NM_I(sbi); - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) return err; @@ -2991,6 +2992,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; + set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a80526f4fed0..8b1ef8834b93 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -458,6 +458,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, /* Get the previous summary */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; goto got_it; @@ -859,5 +860,5 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) #endif sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ - return ret ? ret: err; + return ret ? 
ret : err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5fea37195a8a..6c5496ba137f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1794,7 +1794,7 @@ static int issue_discard_thread(void *data) if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; - } else if (issued == -1){ + } else if (issued == -1) { wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); if (!wait_ms) wait_ms = dpolicy.mid_interval; @@ -2170,6 +2170,7 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); + se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); @@ -2361,6 +2362,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, { struct curseg_info *curseg = CURSEG_I(sbi, type); void *addr = curseg->sum_blk; + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); } @@ -3778,6 +3780,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; + s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; @@ -3840,6 +3843,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; + for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; @@ -3941,6 +3945,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blkoff; + seg_i = CURSEG_I(sbi, i); if (sbi->ckpt->alloc_type[i] == SSR) blkoff = sbi->blocks_per_seg; @@ -3977,6 +3982,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; + if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else @@ -4560,6 +4566,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); + __set_test_and_inuse(sbi, curseg_t->segno); } } @@ -4837,6 +4844,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->victim_secmap); } @@ -4881,6 +4889,7 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) static void destroy_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i = SM_I(sbi)->free_info; + if (!free_i) return; SM_I(sbi)->free_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4c4586ea2176..982b24b8e522 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -556,6 +556,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; /* @@ -1879,7 +1880,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) ret = sync_filesystem(sbi->sb); if (ret || err) { - err = ret ? ret: err; + err = ret ? ret : err; goto restore_flag; } @@ -3717,7 +3718,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 1: NR_TEMP_TYPE; + int n = (i == META) ? 
1 : NR_TEMP_TYPE; int j; sbi->write_io[i] = diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8159fae74b9a..3d43b27bad95 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -486,6 +486,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, f2fs_wait_on_page_writeback(xpage, NODE, true, true); } else { struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { From b52051c6e145dead8970b422a1fa17a68941cff1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Apr 2021 17:56:53 +0800 Subject: [PATCH 230/580] f2fs: document: add description about compressed space handling User or developer may still be confused about why f2fs doesn't expose compressed space to userspace, add description about compressed space handling policy into f2fs documentation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 63c0c49b726d..992bf91eeec8 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -819,6 +819,14 @@ Compression implementation * chattr +c file * chattr +c dir; touch dir/file * mount w/ -o compress_extension=ext; touch file.ext + * mount w/ -o compress_extension=*; touch any_file + +- At this point, compression feature doesn't expose compressed space to user + directly in order to guarantee potential data updates later to the space. + Instead, the main goal is to reduce data writes to flash disk as much as + possible, resulting in extending disk life time as well as relaxing IO + congestion. Alternatively, we've added ioctl interface to reclaim compressed + space and show it to user after putting the immutable bit. Compress metadata layout:: From f50b560d8a6cc32c52d4aacd7cd099752dd0c174 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Apr 2021 17:56:18 +0800 Subject: [PATCH 231/580] f2fs: avoid duplicated codes for cleanup f2fs_segment_has_free_slot() was copied and modified from __next_free_blkoff(), they are almost the same, clean up to reuse common code as much as possible. 
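The refactor reduces to one helper that computes the next free block offset from the OR of the checkpoint and current validity bitmaps; both former copies then become one-line callers. A minimal, self-contained userspace sketch of that shape (fixed-size toy bitmaps, not the kernel implementation):

#include <limits.h>
#include <stdbool.h>

#define BLKS_PER_SEG 512UL
#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* first block at or after 'start' that is free in both bitmaps */
unsigned long next_free_blkoff(const unsigned long *ckpt_map,
                               const unsigned long *cur_map,
                               unsigned long start)
{
        unsigned long i;

        for (i = start; i < BLKS_PER_SEG; i++) {
                unsigned long word = i / BITS_PER_LONG;
                unsigned long bit = 1UL << (i % BITS_PER_LONG);

                if (!((ckpt_map[word] | cur_map[word]) & bit))
                        return i;
        }
        return BLKS_PER_SEG;            /* segment is full */
}

/* both former copies collapse into trivial callers */
bool segment_has_free_slot(const unsigned long *ckpt_map,
                           const unsigned long *cur_map)
{
        return next_free_blkoff(ckpt_map, cur_map, 0) < BLKS_PER_SEG;
}
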
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6c5496ba137f..74267121bce4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2634,22 +2634,20 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; } -static void __next_free_blkoff(struct f2fs_sb_info *sbi, - struct curseg_info *seg, block_t start) +static int __next_free_blkoff(struct f2fs_sb_info *sbi, + int segno, block_t start) { - struct seg_entry *se = get_seg_entry(sbi, seg->segno); + struct seg_entry *se = get_seg_entry(sbi, segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); unsigned long *target_map = SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; - int i, pos; + int i; for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); - - seg->next_blkoff = pos; + return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); } /* @@ -2661,26 +2659,16 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg) { if (seg->alloc_type == SSR) - __next_free_blkoff(sbi, seg, seg->next_blkoff + 1); + seg->next_blkoff = + __next_free_blkoff(sbi, seg->segno, + seg->next_blkoff + 1); else seg->next_blkoff++; } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) { - struct seg_entry *se = get_seg_entry(sbi, segno); - int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); - unsigned long *target_map = SIT_I(sbi)->tmp_map; - unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; - unsigned long *cur_map = (unsigned long *)se->cur_valid_map; - int i, pos; - - for (i = 0; i < entries; i++) - target_map[i] = ckpt_map[i] | cur_map[i]; - - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, 0); - - return pos < sbi->blocks_per_seg; + return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg; } /* @@ -2708,7 +2696,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; - __next_free_blkoff(sbi, curseg, 0); + curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); if (IS_ERR(sum_page)) { From b117a9074ee21eec8edfd1ee5a041a031c54d483 Mon Sep 17 00:00:00 2001 From: Yi Chen Date: Tue, 13 Apr 2021 17:30:50 +0800 Subject: [PATCH 232/580] f2fs: fix to avoid NULL pointer dereference Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 pc : f2fs_put_page+0x1c/0x26c lr : __revoke_inmem_pages+0x544/0x75c f2fs_put_page+0x1c/0x26c __revoke_inmem_pages+0x544/0x75c __f2fs_commit_inmem_pages+0x364/0x3c0 f2fs_commit_inmem_pages+0xc8/0x1a0 f2fs_ioc_commit_atomic_write+0xa4/0x15c f2fs_ioctl+0x5b0/0x1574 file_ioctl+0x154/0x320 do_vfs_ioctl+0x164/0x740 __arm64_sys_ioctl+0x78/0xa4 el0_svc_common+0xbc/0x1d0 el0_svc_handler+0x74/0x98 el0_svc+0x8/0xc In f2fs_put_page, we access page->mapping is NULL. The root cause is: In some cases, the page refcount and ATOMIC_WRITTEN_PAGE flag miss set for page-priavte flag has been set. We add f2fs_bug_on like this: f2fs_register_inmem_page() { ... f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); f2fs_bug_on(F2FS_I_SB(inode), !IS_ATOMIC_WRITTEN_PAGE(page)); ... 
} The bug on stack follow link this: PC is at f2fs_register_inmem_page+0x238/0x2b4 LR is at f2fs_register_inmem_page+0x2a8/0x2b4 f2fs_register_inmem_page+0x238/0x2b4 f2fs_set_data_page_dirty+0x104/0x164 set_page_dirty+0x78/0xc8 f2fs_write_end+0x1b4/0x444 generic_perform_write+0x144/0x1cc __generic_file_write_iter+0xc4/0x174 f2fs_file_write_iter+0x2c0/0x350 __vfs_write+0x104/0x134 vfs_write+0xe8/0x19c SyS_pwrite64+0x78/0xb8 To fix this issue, let's add page refcount add page-priavte flag. The page-private flag is not cleared and needs further analysis. Signed-off-by: Chao Yu Signed-off-by: Ge Qiu Signed-off-by: Dehe Gu Signed-off-by: Yi Chen Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 74267121bce4..d9834f6ab48e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,7 +186,10 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); + if (PagePrivate(page)) + set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + else + f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); From f6498cc5f8312de2e1150d0389f075e3067a7625 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Mon, 19 Apr 2021 10:20:03 +0800 Subject: [PATCH 233/580] f2fs: remove unnecessary struct declaration struct dnode_of_data is defined at 897th line. The declaration here is unnecessary. Remove it. Signed-off-by: Wan Jiabing Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8bb0ecba09b0..e69e9f7a02a0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3321,7 +3321,6 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); /* * node.c */ -struct dnode_of_data; struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); From 133ef4e81073c2444c298421441855dfdcb17649 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Apr 2021 09:54:55 +0800 Subject: [PATCH 234/580] f2fs: avoid using native allocate_segment_by_default() As we did for other cases, in fix_curseg_write_pointer(), let's use wrapped f2fs_allocate_new_section() instead of native allocate_segment_by_default(), by this way, it fixes to cover segment allocation with curseg_lock and sentry_lock. 
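The fix follows the usual locked-wrapper pattern: the raw allocator is only reachable through a wrapper that takes curseg_lock for read and sentry_lock for write, so no call path can reach the allocation without the locks held. A simplified userspace sketch of that pattern using pthread rwlocks (illustrative only, not the kernel primitives):

#include <pthread.h>

static pthread_rwlock_t curseg_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t sentry_lock = PTHREAD_RWLOCK_INITIALIZER;

/* must be called with curseg_lock (read) and sentry_lock (write) held */
static void __allocate_new_section_locked(int type, int force)
{
        /* ... pick a new segment/section for 'type' ... */
        (void)type;
        (void)force;
}

/* the only entry point exported to the rest of the code */
void allocate_new_section(int type, int force)
{
        pthread_rwlock_rdlock(&curseg_lock);
        pthread_rwlock_wrlock(&sentry_lock);
        __allocate_new_section_locked(type, force);
        pthread_rwlock_unlock(&sentry_lock);
        pthread_rwlock_unlock(&curseg_lock);
}
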
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 15 ++++++++------- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e69e9f7a02a0..eff774e6cf30 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3405,7 +3405,7 @@ void f2fs_get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e088dea455af..f48fc9e69dd7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1684,7 +1684,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, down_write(&sbi->pin_sem); f2fs_lock_op(sbi); - f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); map.m_seg_type = CURSEG_COLD_DATA_PINNED; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d9834f6ab48e..86ea62422f37 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2932,7 +2932,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, } static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, - bool new_sec) + bool new_sec, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; @@ -2940,7 +2940,7 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, if (!curseg->inited) goto alloc; - if (curseg->next_blkoff || + if (force || curseg->next_blkoff || get_valid_blocks(sbi, curseg->segno, new_sec)) goto alloc; @@ -2952,16 +2952,17 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, locate_dirty_segment(sbi, old_segno); } -static void __allocate_new_section(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_section(struct f2fs_sb_info *sbi, + int type, bool force) { - __allocate_new_segment(sbi, type, true); + __allocate_new_segment(sbi, type, true, force); } -void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type) +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_section(sbi, type); + __allocate_new_section(sbi, type, force); up_write(&SIT_I(sbi)->sentry_lock); up_read(&SM_I(sbi)->curseg_lock); } @@ -2973,7 +2974,7 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segment(sbi, i, false); + __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); up_read(&SM_I(sbi)->curseg_lock); } From 5e1af4c66eef5bdff7fddf057f1151d5bb6fbbc5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 23 Apr 2021 14:09:38 +0800 Subject: [PATCH 235/580] f2fs: clean up left deprecated IO trace codes Commit d5f7bc0064e0 ("f2fs: deprecate f2fs_trace_io") left some dead codes, delete them. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 6 ------ fs/f2fs/f2fs.h | 8 -------- 2 files changed, 14 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 3a38e0e2840b..31faccd8c01e 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -77,12 +77,6 @@ bool f2fs_is_compressed_page(struct page *page) return false; if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) return false; - /* - * page->private may be set with pid. - * pid_max is enough to check if it is traced. - */ - if (IS_IO_TRACED_PAGE(page)) - return false; f2fs_bug_on(F2FS_M_SB(page->mapping), *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eff774e6cf30..fe3b0abf3507 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1302,14 +1302,6 @@ enum { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == DUMMY_WRITTEN_PAGE) -#ifdef CONFIG_F2FS_IO_TRACE -#define IS_IO_TRACED_PAGE(page) \ - (page_private(page) > 0 && \ - page_private(page) < (unsigned long)PID_MAX_LIMIT) -#else -#define IS_IO_TRACED_PAGE(page) (0) -#endif - #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) From d5c8390777f6b42bb825971f7dc86ef043bbf441 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Apr 2021 16:39:41 +0800 Subject: [PATCH 236/580] f2fs: compress: remove unneed check condition In only call path of __cluster_may_compress(), __f2fs_write_data_pages() has checked SBI_POR_DOING condition, and also cluster_may_compress() has checked CP_ERROR_FLAG condition, so remove redundant check condition in __cluster_may_compress() for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 31faccd8c01e..a7c79fb27d11 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -861,7 +861,6 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) static bool __cluster_may_compress(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); int i; @@ -869,12 +868,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc) for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; - f2fs_bug_on(sbi, !page); - - if (unlikely(f2fs_cp_error(sbi))) - return false; - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - return false; + f2fs_bug_on(F2FS_I_SB(cc->inode), !page); /* beyond EOF */ if (page->index >= nr_pages) From d2c40599b68636604ee6135a060032618c324c25 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 22 Apr 2021 18:19:25 +0800 Subject: [PATCH 237/580] f2fs: drop inplace IO if fs status is abnormal If filesystem has cp_error or need_fsck status, let's drop inplace IO to avoid further corruption of fs data. 
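The change amounts to an early bail-out: when the filesystem is already flagged as needing fsck or has hit a checkpoint error, the in-place write is refused and the bio queued for it is completed with an I/O error rather than left pending. A rough sketch of that shape with placeholder types (not the real struct f2fs_io_info):

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>

struct bio_stub {
        int status;                     /* 0 = ok, negative errno = failed */
        void (*end_io)(struct bio_stub *);
};

struct write_ctx {
        bool fs_needs_fsck;
        bool fs_cp_error;
        struct bio_stub *bio;           /* pending merged bio, may be NULL */
};

int inplace_write(struct write_ctx *ctx)
{
        if (ctx->fs_needs_fsck || ctx->fs_cp_error) {
                /* fs is already known-bad: fail the queued bio, don't write */
                if (ctx->bio) {
                        ctx->bio->status = -EIO;
                        ctx->bio->end_io(ctx->bio);
                        ctx->bio = NULL;
                }
                return -EIO;
        }
        /* ... submit the in-place write as before ... */
        return 0;
}
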
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 86ea62422f37..cfe8a210b49f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3551,7 +3551,13 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto drop_bio; + } + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) { + err = -EIO; + goto drop_bio; } stat_inc_inplace_blocks(fio->sbi); @@ -3565,6 +3571,15 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } + return err; +drop_bio: + if (fio->bio) { + struct bio *bio = *(fio->bio); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + fio->bio = NULL; + } return err; } From a4a0a425d666b0ccfd3815c19de437eb7afcaad6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 9 May 2021 21:53:03 -0700 Subject: [PATCH 238/580] f2fs: avoid null pointer access when handling IPU error Unable to handle kernel NULL pointer dereference at virtual address 000000000000001a pc : f2fs_inplace_write_data+0x144/0x208 lr : f2fs_inplace_write_data+0x134/0x208 Call trace: f2fs_inplace_write_data+0x144/0x208 f2fs_do_write_data_page+0x270/0x770 f2fs_write_single_data_page+0x47c/0x830 __f2fs_write_data_pages+0x444/0x98c f2fs_write_data_pages.llvm.16514453770497736882+0x2c/0x38 do_writepages+0x58/0x118 __writeback_single_inode+0x44/0x300 writeback_sb_inodes+0x4b8/0x9c8 wb_writeback+0x148/0x42c wb_do_writeback+0xc8/0x390 wb_workfn+0xb0/0x2f4 process_one_work+0x1fc/0x444 worker_thread+0x268/0x4b4 kthread+0x13c/0x158 ret_from_fork+0x10/0x18 Fixes: 955772787667 ("f2fs: drop inplace IO if fs status is abnormal") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cfe8a210b49f..d1657c8547bd 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3573,12 +3573,12 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) return err; drop_bio: - if (fio->bio) { + if (fio->bio && *(fio->bio)) { struct bio *bio = *(fio->bio); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - fio->bio = NULL; + *(fio->bio) = NULL; } return err; } From e707194930cd7660b885c86a832e4284f7aa7d5d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 May 2021 12:11:14 -0700 Subject: [PATCH 239/580] f2fs: support iflag change given the mask In f2fs_fileattr_set(), if (!fa->flags_valid) mask &= FS_COMMON_FL; In this case, we can set supported flags by mask only instead of BUG_ON. 
/* Flags shared betwen flags/xflags */ (FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NODUMP_FL | FS_NOATIME_FL | FS_DAX_FL | \ FS_PROJINHERIT_FL) Fixes: 9b1bb01c8ae7 ("f2fs: convert to fileattr") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f48fc9e69dd7..ed3d919109ec 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1834,7 +1834,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) struct f2fs_inode_info *fi = F2FS_I(inode); u32 masked_flags = fi->i_flags & mask; - f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask)); + /* mask can be shrunk by flags_valid selector */ + iflags &= mask; /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) From 86bdb5dcc866f21fd485341c0797bb65863caf78 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 May 2021 17:00:43 +0800 Subject: [PATCH 240/580] f2fs: compress: fix to free compress page correctly In error path of f2fs_write_compressed_pages(), it needs to call f2fs_compress_free_page() to release temporary page. Fixes: 5e6bbde95982 ("f2fs: introduce mempool for {,de}compress intermediate page allocation") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index a7c79fb27d11..29e6dbf0fc1f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1343,7 +1343,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (i = 0; i < cc->nr_cpages; i++) { if (!cc->cpages[i]) continue; - f2fs_put_page(cc->cpages[i], 1); + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; } out_put_cic: kmem_cache_free(cic_entry_slab, cic); From 0f006f8970cd49df284173f4d05c90c7fb21c287 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 May 2021 17:30:31 +0800 Subject: [PATCH 241/580] f2fs: compress: fix race condition of overwrite vs truncate pos_fsstress testcase complains a panic as belew: ------------[ cut here ]------------ kernel BUG at fs/f2fs/compress.c:1082! invalid opcode: 0000 [#1] SMP PTI CPU: 4 PID: 2753477 Comm: kworker/u16:2 Tainted: G OE 5.12.0-rc1-custom #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Workqueue: writeback wb_workfn (flush-252:16) RIP: 0010:prepare_compress_overwrite+0x4c0/0x760 [f2fs] Call Trace: f2fs_prepare_compress_overwrite+0x5f/0x80 [f2fs] f2fs_write_cache_pages+0x468/0x8a0 [f2fs] f2fs_write_data_pages+0x2a4/0x2f0 [f2fs] do_writepages+0x38/0xc0 __writeback_single_inode+0x44/0x2a0 writeback_sb_inodes+0x223/0x4d0 __writeback_inodes_wb+0x56/0xf0 wb_writeback+0x1dd/0x290 wb_workfn+0x309/0x500 process_one_work+0x220/0x3c0 worker_thread+0x53/0x420 kthread+0x12f/0x150 ret_from_fork+0x22/0x30 The root cause is truncate() may race with overwrite as below, so that one reference count left in page can not guarantee the page attaching in mapping tree all the time, after truncation, later find_lock_page() may return NULL pointer. - prepare_compress_overwrite - f2fs_pagecache_get_page - unlock_page - f2fs_setattr - truncate_setsize - truncate_inode_page - delete_from_page_cache - find_lock_page Fix this by avoiding referencing updated page. 
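Put differently, holding a reference does not keep a page in the mapping once it is unlocked, so the second find_lock_page() has to be treated as fallible: a NULL result means the page was truncated, and the prepare step must back out and retry instead of hitting a BUG_ON. A toy, non-kernel sketch of that back-out-and-retry shape (all names hypothetical):

#include <stdbool.h>
#include <stddef.h>

#define CLUSTER_SIZE 4

struct toy_page { int refcount; bool locked; };

/* toy page-cache lookup: NULL means truncate already removed the page */
static struct toy_page *toy_find_lock_page(struct toy_page **mapping, size_t idx)
{
        struct toy_page *p = mapping[idx];

        if (p) {
                p->locked = true;
                p->refcount++;
        }
        return p;
}

static void toy_unlock_put_page(struct toy_page *p)
{
        p->locked = false;
        p->refcount--;
}

/* Returns true when the whole cluster was re-locked; false tells the
 * caller to drop its state and redo the read/prepare step. */
bool relock_cluster(struct toy_page **mapping, size_t start, struct toy_page **out)
{
        size_t i;

        for (i = 0; i < CLUSTER_SIZE; i++) {
                out[i] = toy_find_lock_page(mapping, start + i);
                if (!out[i]) {
                        while (i--)
                                toy_unlock_put_page(out[i]);   /* back out cleanly */
                        return false;
                }
        }
        return true;
}
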
Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 29e6dbf0fc1f..9b9f692e426d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -118,19 +118,6 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) f2fs_drop_rpages(cc, len, true); } -static void f2fs_put_rpages_mapping(struct address_space *mapping, - pgoff_t start, int len) -{ - int i; - - for (i = 0; i < len; i++) { - struct page *page = find_get_page(mapping, start + i); - - put_page(page); - put_page(page); - } -} - static void f2fs_put_rpages_wbc(struct compress_ctx *cc, struct writeback_control *wbc, bool redirty, int unlock) { @@ -1007,7 +994,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, } if (PageUptodate(page)) - unlock_page(page); + f2fs_put_page(page, 1); else f2fs_compress_ctx_add_page(cc, page); } @@ -1017,32 +1004,34 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, false, true); + f2fs_put_rpages(cc); f2fs_destroy_compress_ctx(cc); if (ret) - goto release_pages; + goto out; if (bio) f2fs_submit_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) - goto release_pages; + goto out; } for (i = 0; i < cc->cluster_size; i++) { f2fs_bug_on(sbi, cc->rpages[i]); page = find_lock_page(mapping, start_idx + i); - f2fs_bug_on(sbi, !page); + if (!page) { + /* page can be truncated */ + goto release_and_retry; + } f2fs_wait_on_page_writeback(page, DATA, true, true); - f2fs_compress_ctx_add_page(cc, page); - f2fs_put_page(page, 0); if (!PageUptodate(page)) { +release_and_retry: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); - f2fs_put_rpages_mapping(mapping, start_idx, - cc->cluster_size); f2fs_destroy_compress_ctx(cc); goto retry; } @@ -1074,10 +1063,10 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, } unlock_pages: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); -release_pages: - f2fs_put_rpages_mapping(mapping, start_idx, i); f2fs_destroy_compress_ctx(cc); +out: return ret; } From 5b13b5b5d92c9bf75b7edca87b8327af2b0cde55 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 10 May 2021 17:30:32 +0800 Subject: [PATCH 242/580] f2fs: compress: fix to assign cc.cluster_idx correctly In f2fs_destroy_compress_ctx(), after f2fs_destroy_compress_ctx(), cc.cluster_idx will be cleared w/ NULL_CLUSTER, f2fs_cluster_blocks() may check wrong cluster metadata, fix it. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 17 +++++++++-------- fs/f2fs/data.c | 6 +++--- fs/f2fs/f2fs.h | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 9b9f692e426d..968019134ad7 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -146,13 +146,14 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc) return cc->rpages ? 
0 : -ENOMEM; } -void f2fs_destroy_compress_ctx(struct compress_ctx *cc) +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; - cc->cluster_idx = NULL_CLUSTER; + if (!reuse) + cc->cluster_idx = NULL_CLUSTER; } void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) @@ -1005,7 +1006,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, false, true); f2fs_put_rpages(cc); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); if (ret) goto out; if (bio) @@ -1032,7 +1033,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); goto retry; } } @@ -1065,7 +1066,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, unlock_pages: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); out: return ret; } @@ -1101,7 +1102,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, set_cluster_dirty(&cc); f2fs_put_rpages_wbc(&cc, NULL, false, 1); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); return first_index; } @@ -1321,7 +1322,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, f2fs_put_rpages(cc); page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: @@ -1483,7 +1484,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); f2fs_put_rpages_wbc(cc, wbc, false, 0); destroy_out: - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return err; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f40313d77322..c09773045745 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2272,7 +2272,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, max_nr_pages, &last_block_in_bio, is_readahead, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; } @@ -2315,7 +2315,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, max_nr_pages, &last_block_in_bio, is_readahead, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); } } #endif @@ -3023,7 +3023,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } } if (f2fs_compressed_file(inode)) - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); #endif if (retry) { index = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fe3b0abf3507..c04e06052383 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3964,7 +3964,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); -void f2fs_destroy_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); From ef58ceffce98a869a4f434361fe304f2abf32c17 
Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 May 2021 14:38:47 -0700 Subject: [PATCH 243/580] f2fs: avoid swapon failure by giving a warning first The final solution can be migrating blocks to form a section-aligned file internally. Meanwhile, let's ask users to do that when preparing the swap file initially like: 1) create() 2) ioctl(F2FS_IOC_SET_PIN_FILE) 3) fallocate() Reported-by: kernel test robot Fixes: 36e4d95891ed ("f2fs: check if swapfile is section-alligned") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c09773045745..316eafe457b9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3791,6 +3791,7 @@ static int f2fs_is_file_aligned(struct inode *inode) block_t pblock; unsigned long nr_pblocks; unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; int ret = 0; cur_lblock = 0; @@ -3823,13 +3824,20 @@ static int f2fs_is_file_aligned(struct inode *inode) if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || nr_pblocks & (blocks_per_sec - 1)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; } cur_lblock += nr_pblocks; } + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } @@ -3848,6 +3856,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, int nr_extents = 0; unsigned long nr_pblocks; unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; int ret = 0; /* @@ -3886,9 +3895,12 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || nr_pblocks & (blocks_per_sec - 1)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; } if (cur_lblock + nr_pblocks >= sis->max) @@ -3917,6 +3929,11 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; + + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } From ab44caf36dac040440ee3e8f0e4dc58a7d2717fc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 May 2021 07:38:00 -0700 Subject: [PATCH 244/580] f2fs: return EINVAL for hole cases in swap file This tries to fix xfstests/generic/495. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 316eafe457b9..bbbb3b212c1e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3886,7 +3886,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { f2fs_err(sbi, "Swapfile has holes\n"); - ret = -ENOENT; + ret = -EINVAL; goto out; } @@ -4042,7 +4042,7 @@ static int check_swap_activate(struct swap_info_struct *sis, return ret; bad_bmap: f2fs_err(sbi, "Swapfile has holes\n"); - return -ENOENT; + return -EINVAL; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, From 0293824f30a5644ccd7ccccce8cd20249194e6dc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 27 Apr 2021 11:07:30 +0800 Subject: [PATCH 245/580] f2fs: compress: rename __cluster_may_compress This patch renames __cluster_may_compress() to cluster_has_invalid_data() for better readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 968019134ad7..8c1763304fb7 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -847,7 +847,7 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } -static bool __cluster_may_compress(struct compress_ctx *cc) +static bool cluster_has_invalid_data(struct compress_ctx *cc) { loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); @@ -860,9 +860,9 @@ static bool __cluster_may_compress(struct compress_ctx *cc) /* beyond EOF */ if (page->index >= nr_pages) - return false; + return true; } - return true; + return false; } static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) @@ -938,7 +938,7 @@ static bool cluster_may_compress(struct compress_ctx *cc) return false; if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) return false; - return __cluster_may_compress(cc); + return !cluster_has_invalid_data(cc); } static void set_cluster_writeback(struct compress_ctx *cc) From b23e6fe5acba423c7db36021443258c3669324b2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 27 Apr 2021 11:07:30 +0800 Subject: [PATCH 246/580] f2fs: add cp_error check in f2fs_write_compressed_pages This patch adds cp_error check in f2fs_write_compressed_pages() like we did in f2fs_write_single_data_page() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 8c1763304fb7..3c3b16a60d71 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1187,6 +1187,12 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, loff_t psize; int i, err; + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(cc->rpages[0]->mapping, -EIO); + goto out_free; + } + if (IS_NOQUOTA(inode)) { /* * We need to wait for node_write to avoid block allocation during From 5f39b604a1694642d546ff9821816bc23a55ac75 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 May 2021 18:10:38 +0800 Subject: [PATCH 247/580] f2fs: fix to avoid racing on fsync_entry_slab by multi filesystem instances As syzbot reported, there is an use-after-free issue during f2fs recovery: Use-after-free write at 0xffff88823bc16040 (in kfence-#10): kmem_cache_destroy+0x1f/0x120 
mm/slab_common.c:486 f2fs_recover_fsync_data+0x75b0/0x8380 fs/f2fs/recovery.c:869 f2fs_fill_super+0x9393/0xa420 fs/f2fs/super.c:3945 mount_bdev+0x26c/0x3a0 fs/super.c:1367 legacy_get_tree+0xea/0x180 fs/fs_context.c:592 vfs_get_tree+0x86/0x270 fs/super.c:1497 do_new_mount fs/namespace.c:2905 [inline] path_mount+0x196f/0x2be0 fs/namespace.c:3235 do_mount fs/namespace.c:3248 [inline] __do_sys_mount fs/namespace.c:3456 [inline] __se_sys_mount+0x2f9/0x3b0 fs/namespace.c:3433 do_syscall_64+0x3f/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is multi f2fs filesystem instances can race on accessing global fsync_entry_slab pointer, result in use-after-free issue of slab cache, fixes to init/destroy this slab cache only once during module init/destroy procedure to avoid this issue. Reported-by: syzbot+9d90dad32dd9727ed084@syzkaller.appspotmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/recovery.c | 23 ++++++++++++++--------- fs/f2fs/super.c | 8 +++++++- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c04e06052383..246da14e8cc0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3574,6 +3574,8 @@ void f2fs_destroy_garbage_collection_cache(void); */ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); +int __init f2fs_create_recovery_cache(void); +void f2fs_destroy_recovery_cache(void); /* * debug.c diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 8b1ef8834b93..fd289fc6ed6a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -787,13 +787,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif - fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) { - err = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&tmp_inode_list); INIT_LIST_HEAD(&dir_list); @@ -851,8 +844,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } } - kmem_cache_destroy(fsync_entry_slab); -out: #ifdef CONFIG_QUOTA /* Turn quotas off */ if (quota_enabled) @@ -862,3 +853,17 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) return ret ? 
ret : err; } + +int __init f2fs_create_recovery_cache(void) +{ + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", + sizeof(struct fsync_inode_entry)); + if (!fsync_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_recovery_cache(void) +{ + kmem_cache_destroy(fsync_entry_slab); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 982b24b8e522..0ab641c7eb57 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4176,9 +4176,12 @@ static int __init init_f2fs_fs(void) err = f2fs_create_checkpoint_caches(); if (err) goto free_segment_manager_caches; - err = f2fs_create_extent_cache(); + err = f2fs_create_recovery_cache(); if (err) goto free_checkpoint_caches; + err = f2fs_create_extent_cache(); + if (err) + goto free_recovery_cache; err = f2fs_create_garbage_collection_cache(); if (err) goto free_extent_cache; @@ -4227,6 +4230,8 @@ static int __init init_f2fs_fs(void) f2fs_destroy_garbage_collection_cache(); free_extent_cache: f2fs_destroy_extent_cache(); +free_recovery_cache: + f2fs_destroy_recovery_cache(); free_checkpoint_caches: f2fs_destroy_checkpoint_caches(); free_segment_manager_caches: @@ -4252,6 +4257,7 @@ static void __exit exit_f2fs_fs(void) f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); + f2fs_destroy_recovery_cache(); f2fs_destroy_checkpoint_caches(); f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); From 01a63ad35fe05b560e138ce1df37cefbe9fb736b Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Mon, 10 May 2021 20:24:44 +0900 Subject: [PATCH 248/580] f2fs: Prevent swap file in LFS mode The kernel writes to swap files on f2fs directly without the assistance of the filesystem. This direct write by kernel can be non-sequential even when the f2fs is in LFS mode. Such non-sequential write conflicts with the LFS semantics. Especially when f2fs is set up on zoned block devices, the non-sequential write causes unaligned write command errors. To avoid the non-sequential writes to swap files, prevent swap file activation when the filesystem is in LFS mode. Fixes: 4969c06a0d83 ("f2fs: support swap file w/ DIO") Signed-off-by: Shin'ichiro Kawasaki Cc: stable@vger.kernel.org # v5.10+ Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bbbb3b212c1e..0e5895c78c83 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4057,6 +4057,12 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (f2fs_readonly(F2FS_I_SB(inode)->sb)) return -EROFS; + if (f2fs_lfs_mode(F2FS_I_SB(inode))) { + f2fs_err(F2FS_I_SB(inode), + "Swapfile not supported in LFS mode"); + return -EINVAL; + } + ret = f2fs_convert_inline_inode(inode); if (ret) return ret; From 70f45238a303f8864caf2721cd3a78637a8af28f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 May 2021 18:17:34 +0800 Subject: [PATCH 249/580] f2fs: atgc: fix to set default age threshold Default age threshold value is missed to set, fix it. 
Fixes: 093749e296e2 ("f2fs: support age threshold based garbage collection") Reported-by: Sahitya Tummala Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8d1f17ab94d8..b40a2da90147 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1822,6 +1822,7 @@ static void init_atgc_management(struct f2fs_sb_info *sbi) am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; + am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD; } void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) From cd200b1c597fc31840f887f0508120219a01f58c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 May 2021 17:52:56 +0800 Subject: [PATCH 250/580] f2fs: compress: remove unneeded f2fs_put_dnode() If we don't initialize dn.inode_page for f2fs_get_block(), f2fs_get_block() will call f2fs_put_dnode() itself, so let's remove unneeded f2fs_put_dnode() in f2fs_vm_page_mkwrite(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ed3d919109ec..c70a15ae5cce 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -116,7 +116,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_block(&dn, page->index); - f2fs_put_dnode(&dn); f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); } From 6a6d6763db562c0f7fb891ac65cba08649b2e9a3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 May 2021 17:52:57 +0800 Subject: [PATCH 251/580] f2fs: compress: clean up parameter of __f2fs_cluster_blocks() Previously, in order to reuse __f2fs_cluster_blocks(), f2fs_is_compressed_cluster() assigned a compress_ctx type variable, which is used to pass few parameters (cc.inode, cc.cluster_size, cc.cluster_idx), it's wasteful to allocate such large space in stack. Let's clean up parameters of __f2fs_cluster_blocks() to avoid that. 
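The clean-up is simply "pass the two values the helper needs instead of building a context object to carry them": a caller that only knows the inode and the index no longer assembles a compress_ctx on its stack. A tiny sketch with placeholder types (not the real f2fs structures):

#include <stddef.h>

struct toy_inode { size_t log_cluster_size; };

/*
 * Before: the helper took a large context struct, so even callers that
 * only knew (inode, index) had to fill one in on the stack.
 * After: it takes the scalars it actually uses.
 */
static int toy_cluster_blocks(struct toy_inode *inode, size_t cluster_idx,
                              int count_compressed_only)
{
        (void)inode;
        (void)cluster_idx;
        (void)count_compressed_only;
        return 0;       /* ... walk the on-disk addresses and count ... */
}

int toy_is_compressed_cluster(struct toy_inode *inode, size_t index)
{
        /* derive the cluster index inline; no context struct needed */
        return toy_cluster_blocks(inode, index >> inode->log_cluster_size, 0);
}
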
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 3c3b16a60d71..d57f858b8e77 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -865,14 +865,17 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc) return false; } -static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) +static int __f2fs_cluster_blocks(struct inode *inode, + unsigned int cluster_idx, bool compr) { struct dnode_of_data dn; + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int start_idx = cluster_idx << + F2FS_I(inode)->i_log_cluster_size; int ret; - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); - ret = f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc), - LOOKUP_NODE); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) { if (ret == -ENOENT) ret = 0; @@ -883,7 +886,7 @@ static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) int i; ret = 1; - for (i = 1; i < cc->cluster_size; i++) { + for (i = 1; i < cluster_size; i++) { block_t blkaddr; blkaddr = data_blkaddr(dn.inode, @@ -905,25 +908,15 @@ static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) /* return # of compressed blocks in compressed cluster */ static int f2fs_compressed_blocks(struct compress_ctx *cc) { - return __f2fs_cluster_blocks(cc, true); + return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true); } /* return # of valid blocks in compressed cluster */ -static int f2fs_cluster_blocks(struct compress_ctx *cc) -{ - return __f2fs_cluster_blocks(cc, false); -} - int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) { - struct compress_ctx cc = { - .inode = inode, - .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, - .cluster_size = F2FS_I(inode)->i_cluster_size, - .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, - }; - - return f2fs_cluster_blocks(&cc); + return __f2fs_cluster_blocks(inode, + index >> F2FS_I(inode)->i_log_cluster_size, + false); } static bool cluster_may_compress(struct compress_ctx *cc) @@ -974,7 +967,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, bool prealloc; retry: - ret = f2fs_cluster_blocks(cc); + ret = f2fs_is_compressed_cluster(cc->inode, start_idx); if (ret <= 0) return ret; From 067d46d1333a46959207340966adff62894f0296 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 15 May 2021 11:09:41 -0700 Subject: [PATCH 252/580] f2fs: return success if there is no work to do Static analysis reports this problem file.c:3206:2: warning: Undefined or garbage value returned to caller return err; ^~~~~~~~~~ err is only set if there is some work to do. Because the loop returns immediately on an error, if all the work was done, a 0 would be returned. Instead of checking the unlikely case that there was no work to do, change the return of err to 0. 
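The bug pattern is a return value only written inside the loop: when the loop body never runs because all the work is already done, the function returns whatever happened to be on the stack. Since every failure already returns from inside the loop, falling out the bottom can only mean success, hence returning a constant 0. A minimal sketch of that shape (hypothetical helper, not the f2fs code):

#include <stddef.h>

/* hypothetical: map/prefetch one extent, advance *next, <0 on failure */
static int map_one_extent(size_t *next)
{
        *next += 1;
        return 0;
}

int precache_extents(size_t start, size_t end)
{
        size_t next = start;
        int err;

        while (next < end) {
                err = map_one_extent(&next);
                if (err)
                        return err;     /* every failure leaves through here */
        }

        /*
         * "return err;" here would be indeterminate when the loop ran zero
         * times; reaching this point always means the work is done.
         */
        return 0;
}
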
Signed-off-by: Tom Rix Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c70a15ae5cce..1edf2390433b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3350,7 +3350,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_lblk = m_next_extent; } - return err; + return 0; } static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) From 468373d4609011f48b9a67a28e5491bd06237790 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 May 2021 09:57:54 +0800 Subject: [PATCH 253/580] f2fs: add MODULE_SOFTDEP to ensure crc32 is included in the initramfs As marcosfrm reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=213089 Initramfs generators rely on "pre" softdeps (and "depends") to include additional required modules. F2FS does not declare "pre: crc32" softdep. Then every generator (dracut, mkinitcpio...) has to maintain a hardcoded list for this purpose. Hence let's use MODULE_SOFTDEP("pre: crc32") in f2fs code. Fixes: 43b6573bac95 ("f2fs: use cryptoapi crc32 functions") Reported-by: marcosfrm Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0ab641c7eb57..e374305c34ff 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4270,4 +4270,5 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32"); From 9bf60aff807c7a5b47c9bdad333ee68601e17541 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 25 May 2021 11:10:53 -0700 Subject: [PATCH 254/580] f2fs: let's allow compression for mmap files This patch allows to compress mmap files. E.g., for so files. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d57f858b8e77..a3563d62fa1f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -925,8 +925,6 @@ static bool cluster_may_compress(struct compress_ctx *cc) return false; if (f2fs_is_atomic_file(cc->inode)) return false; - if (f2fs_is_mmap_file(cc->inode)) - return false; if (!f2fs_cluster_is_full(cc)) return false; if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) From 035cf9dc44c2eac79ae5a104031769563dad910a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 May 2021 17:54:58 +0800 Subject: [PATCH 255/580] f2fs: compress: fix to disallow temp extension This patch restricts to configure compress extension as format of: [filename + '.' + extension] rather than: [filename + '.' + extension + (optional: '.' + temp extension)] in order to avoid to enable compression incorrectly: 1. compress_extension=so 2. touch file.soa 3. 
touch file.so.tmp Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index fc6ebf22239b..dc782dc4d6da 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -148,7 +148,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) return ERR_PTR(err); } -static inline int is_extension_exist(const unsigned char *s, const char *sub) +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -164,6 +165,13 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) if (slen < sublen + 2) return 0; + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + for (i = 1; i < slen - sublen; i++) { if (s[i] != '.') continue; @@ -189,7 +197,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * hot_count = sbi->raw_super->hot_ext_count; for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i])) + if (is_extension_exist(name, extlist[i], true)) break; } @@ -290,7 +298,7 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, hot_count = sbi->raw_super->hot_ext_count; for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i])) { + if (is_extension_exist(name, extlist[i], false)) { up_read(&sbi->sb_lock); return; } @@ -301,7 +309,7 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, ext = F2FS_OPTION(sbi).extensions; for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i])) + if (!is_extension_exist(name, ext[i], false)) continue; set_compress_context(inode); From 84e2a94763b4eb76c67226835be050c7fc6e7ba3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 May 2021 10:07:19 +0800 Subject: [PATCH 256/580] f2fs: atgc: export entries for better tunability via sysfs This patch export below sysfs entries for better ATGC tunability. /sys/fs/f2fs//atgc_candidate_ratio /sys/fs/f2fs//atgc_candidate_count /sys/fs/f2fs//atgc_age_weight /sys/fs/f2fs//atgc_age_threshold Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 28 +++++++++++++++++++++++++ fs/f2fs/sysfs.c | 27 ++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index e61cf7da8e40..d656e4af96c8 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -418,3 +418,31 @@ Description: Show the count of inode newly enabled for compression since mount. Note that when the compression is disabled for the files, this count doesn't decrease. If you write "0" here, you can initialize compr_new_inode to "0". + +What: /sys/fs/f2fs//atgc_candidate_ratio +Date: May 2021 +Contact: "Chao Yu" +Description: When ATGC is on, it controls candidate ratio in order to limit total + number of potential victim in all candidates, the value should be in + range of [0, 100], by default it was initialized as 20(%). 
+ +What: /sys/fs/f2fs//atgc_candidate_count +Date: May 2021 +Contact: "Chao Yu" +Description: When ATGC is on, it controls candidate count in order to limit total + number of potential victim in all candidates, by default it was + initialized as 10 (sections). + +What: /sys/fs/f2fs//atgc_age_weight +Date: May 2021 +Contact: "Chao Yu" +Description: When ATGC is on, it controls age weight to balance weight proportion + in between aging and valid blocks, the value should be in range of + [0, 100], by default it was initialized as 60(%). + +What: /sys/fs/f2fs//atgc_age_threshold +Date: May 2021 +Contact: "Chao Yu" +Description: When ATGC is on, it controls age threshold to bypass GCing young + candidates whose age is not beyond the threshold, by default it was + initialized as 604800 seconds (equals to 7 days). diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 27488290d935..b116a127ef91 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -37,6 +37,7 @@ enum { #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ CPRC_INFO, /* struct ckpt_req_control */ + ATGC_INFO, /* struct atgc_management */ }; struct f2fs_attr { @@ -75,6 +76,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) #endif else if (struct_type == CPRC_INFO) return (unsigned char *)&sbi->cprc_info; + else if (struct_type == ATGC_INFO) + return (unsigned char *)&sbi->am; return NULL; } @@ -498,6 +501,20 @@ static ssize_t __sbi_store(struct f2fs_attr *a, } #endif + if (!strcmp(a->attr.name, "atgc_candidate_ratio")) { + if (t > 100) + return -EINVAL; + sbi->am.candidate_ratio = t; + return count; + } + + if (!strcmp(a->attr.name, "atgc_age_weight")) { + if (t > 100) + return -EINVAL; + sbi->am.age_weight = t; + return count; + } + *ui = (unsigned int)t; return count; @@ -713,6 +730,11 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); #endif +/* For ATGC */ +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -781,6 +803,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(compr_saved_block), ATTR_LIST(compr_new_inode), #endif + /* For ATGC */ + ATTR_LIST(atgc_candidate_ratio), + ATTR_LIST(atgc_candidate_count), + ATTR_LIST(atgc_age_weight), + ATTR_LIST(atgc_age_threshold), NULL, }; From d84ac50992774952fd56bb9ae3a16e554a171465 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 May 2021 19:39:09 +0800 Subject: [PATCH 257/580] f2fs: avoid attaching SB_ACTIVE flag during mount/remount Quoted from [1] "I do remember that I've added this code back then because otherwise orphan cleanup was losing updates to quota files. But you're right that now I don't see how that could be happening and it would be nice if we could get rid of this hack" [1] https://lore.kernel.org/linux-ext4/99cce8ca-e4a0-7301-840f-2ace67c551f3@huawei.com/T/#m04990cfbc4f44592421736b504afcc346b2a7c00 Related fix in ext4 by commit 72ffb49a7b62 ("ext4: do not set SB_ACTIVE in ext4_orphan_cleanup()"). 
f2fs has the same hack implementation in - f2fs_recover_orphan_inodes() - f2fs_recover_fsync_data() - f2fs_disable_checkpoint() Let's get rid of this hack as well in f2fs. Cc: Zhang Yi Cc: Jan Kara Signed-off-by: Chao Yu Acked-by: Jan Kara Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 --- fs/f2fs/recovery.c | 8 ++------ fs/f2fs/super.c | 11 ++++------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 134f8ffb6ee3..4424bedffdc2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -691,9 +691,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; - /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index fd289fc6ed6a..d4259868c191 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -781,8 +781,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif @@ -810,10 +808,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); - else { - /* restore s_flags to let iput() trash data */ - sbi->sb->s_flags = s_flags; - } + else + f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); skip: destroy_fsync_dnodes(&inode_list, err); destroy_fsync_dnodes(&tmp_inode_list, err); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e374305c34ff..f1e0deb29d1a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1853,17 +1853,15 @@ static int f2fs_enable_quotas(struct super_block *sb); static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { - unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; int err = 0; int ret; block_t unusable; - if (s_flags & SB_RDONLY) { + if (sbi->sb->s_flags & SB_RDONLY) { f2fs_err(sbi, "checkpoint=disable on readonly fs"); return -EINVAL; } - sbi->sb->s_flags |= SB_ACTIVE; f2fs_update_time(sbi, DISABLE_TIME); @@ -1881,13 +1879,13 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) ret = sync_filesystem(sbi->sb); if (ret || err) { err = ret ? ret : err; - goto restore_flag; + goto out; } unusable = f2fs_get_unusable_blocks(sbi); if (f2fs_disable_cp_again(sbi, unusable)) { err = -EAGAIN; - goto restore_flag; + goto out; } down_write(&sbi->gc_lock); @@ -1903,8 +1901,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) out_unlock: up_write(&sbi->gc_lock); -restore_flag: - sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ +out: return err; } From 67f68dc2808c5010f0bde68c5e1e625f017b47ba Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 May 2021 17:52:58 +0800 Subject: [PATCH 258/580] f2fs: compress: remove unneeded preallocation We will reserve iblocks for compression saved, so during compressed cluster overwrite, we don't need to preallocate blocks for later write. In addition, it adds a bug_on to detect wrong reserved iblock number in __f2fs_cluster_blocks(). 
Bug fix in the original patch by Jaegeuk: If we released compressed blocks having an immutable bit, we can see less number of compressed block addresses. Let's fix wrong BUG_ON. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 27 +++------------------------ fs/f2fs/file.c | 4 ---- 2 files changed, 3 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index a3563d62fa1f..6adc9c94f260 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -899,6 +899,9 @@ static int __f2fs_cluster_blocks(struct inode *inode, ret++; } } + + f2fs_bug_on(F2FS_I_SB(inode), + !compr && ret != cluster_size && !IS_IMMUTABLE(inode)); } fail: f2fs_put_dnode(&dn); @@ -957,21 +960,16 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct address_space *mapping = cc->inode->i_mapping; struct page *page; - struct dnode_of_data dn; sector_t last_block_in_bio; unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = start_idx_of_cluster(cc); int i, ret; - bool prealloc; retry: ret = f2fs_is_compressed_cluster(cc->inode, start_idx); if (ret <= 0) return ret; - /* compressed case */ - prealloc = (ret < cc->cluster_size); - ret = f2fs_init_compress_ctx(cc); if (ret) return ret; @@ -1029,25 +1027,6 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, } } - if (prealloc) { - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); - - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); - - for (i = cc->cluster_size - 1; i > 0; i--) { - ret = f2fs_get_block(&dn, start_idx + i); - if (ret) { - i = cc->cluster_size; - break; - } - - if (dn.data_blkaddr != NEW_ADDR) - break; - } - - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - } - if (likely(!ret)) { *fsdata = cc->rpages; *pagep = cc->rpages[offset_in_cluster(cc, index)]; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1edf2390433b..6ca0f9c8baec 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -84,10 +84,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) err = ret; goto err; } else if (ret) { - if (ret < F2FS_I(inode)->i_cluster_size) { - err = -EAGAIN; - goto err; - } need_alloc = false; } } From 387fdef43e408fafb1a4b9ddf6ecf36890a95278 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 25 May 2021 11:39:35 -0700 Subject: [PATCH 259/580] f2fs: introduce FI_COMPRESS_RELEASED instead of using IMMUTABLE bit Once we release compressed blocks, we used to set IMMUTABLE bit. But it turned out it disallows every fs operations which we don't need for compression. Let's just prevent writing data only. 
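For readers skimming the diff below, a condensed restatement of the new guard (illustrative only, not additional kernel code): once the compressed blocks of an inode have been released, only the data-write paths refuse it — the mmap fault hunk returns VM_FAULT_SIGBUS and the write path returns -EPERM — while every other inode operation keeps working.

	/* Sketch only: condensed from the f2fs_vm_page_mkwrite() and
	 * f2fs_file_write_iter() hunks below; not verbatim kernel code. */
	static inline int f2fs_deny_write_if_released_sketch(struct inode *inode)
	{
		/* Only data writes are refused; other operations proceed. */
		if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
			return -EPERM;
		return 0;
	}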
Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 3 ++- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/file.c | 18 ++++++++++++------ include/linux/f2fs_fs.h | 1 + 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6adc9c94f260..0fea2275df19 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -901,7 +901,8 @@ static int __f2fs_cluster_blocks(struct inode *inode, } f2fs_bug_on(F2FS_I_SB(inode), - !compr && ret != cluster_size && !IS_IMMUTABLE(inode)); + !compr && ret != cluster_size && + !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)); } fail: f2fs_put_dnode(&dn); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 246da14e8cc0..5d5cbfd45f10 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -705,6 +705,7 @@ enum { FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ + FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_MAX, /* max flag, never be used */ }; @@ -2684,6 +2685,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: + case FI_COMPRESS_RELEASED: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -2805,6 +2807,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_EXTRA_ATTR, fi->flags); if (ri->i_inline & F2FS_PIN_FILE) set_bit(FI_PIN_FILE, fi->flags); + if (ri->i_inline & F2FS_COMPRESS_RELEASED) + set_bit(FI_COMPRESS_RELEASED, fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2825,6 +2829,8 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_EXTRA_ATTR; if (is_inode_flag_set(inode, FI_PIN_FILE)) ri->i_inline |= F2FS_PIN_FILE; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + ri->i_inline |= F2FS_COMPRESS_RELEASED; } static inline int f2fs_has_extra_attr(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6ca0f9c8baec..c65185eac992 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -62,6 +62,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -3568,7 +3571,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) goto out; } - if (IS_IMMUTABLE(inode)) { + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto out; } @@ -3577,8 +3580,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (ret) goto out; - F2FS_I(inode)->i_flags |= F2FS_IMMUTABLE_FL; - f2fs_set_inode_flags(inode); + set_inode_flag(inode, FI_COMPRESS_RELEASED); inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); @@ -3733,7 +3735,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) inode_lock(inode); - if (!IS_IMMUTABLE(inode)) { + if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto unlock_inode; } @@ -3778,8 +3780,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) up_write(&F2FS_I(inode)->i_mmap_sem); if (ret >= 0) { - F2FS_I(inode)->i_flags &= ~F2FS_IMMUTABLE_FL; - f2fs_set_inode_flags(inode); + clear_inode_flag(inode, FI_COMPRESS_RELEASED); inode->i_ctime = current_time(inode); 
f2fs_mark_inode_dirty_sync(inode, true); } @@ -4402,6 +4403,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto unlock; } + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EPERM; + goto unlock; + } + ret = generic_write_checks(iocb, from); if (ret > 0) { bool preallocated = false; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5487a80617a3..f93000c3a127 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -229,6 +229,7 @@ struct f2fs_extent { #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ #define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ #define F2FS_PIN_FILE 0x40 /* file should not be gced */ +#define F2FS_COMPRESS_RELEASED 0x80 /* file released compressed blocks */ struct f2fs_inode { __le16 i_mode; /* file mode */ From 6d845e6005630a590767256c0926812038cb1e80 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Apr 2021 17:20:31 +0800 Subject: [PATCH 260/580] f2fs: restructure f2fs page.private layout Restruct f2fs page private layout for below reasons: There are some cases that f2fs wants to set a flag in a page to indicate a specified status of page: a) page is in transaction list for atomic write b) page contains dummy data for aligned write c) page is migrating for GC d) page contains inline data for inline inode flush e) page belongs to merkle tree, and is verified for fsverity f) page is dirty and has filesystem/inode reference count for writeback g) page is temporary and has decompress io context reference for compression There are existed places in page structure we can use to store f2fs private status/data: - page.flags: PG_checked, PG_private - page.private However it was a mess when we using them, which may cause potential confliction: page.private PG_private PG_checked page._refcount (+1 at most) a) -1 set +1 b) -2 set c), d), e) set f) 0 set +1 g) pointer set The other problem is page.flags has no free slot, if we can avoid set zero to page.private and set PG_private flag, then we use non-zero value to indicate PG_private status, so that we may have chance to reclaim PG_private slot for other usage. [1] The other concern is f2fs has bad scalability in aspect of indicating more page status. So in this patch, let's restructure f2fs' page.private as below to solve above issues: Layout A: lowest bit should be 1 | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | bit 0 PAGE_PRIVATE_NOT_POINTER bit 1 PAGE_PRIVATE_ATOMIC_WRITE bit 2 PAGE_PRIVATE_DUMMY_WRITE bit 3 PAGE_PRIVATE_ONGOING_MIGRATION bit 4 PAGE_PRIVATE_INLINE_INODE bit 5 PAGE_PRIVATE_REF_RESOURCE bit 6- f2fs private data Layout B: lowest bit should be 0 page.private is a wrapped pointer. 
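As a quick illustration of Layout A (a simplified restatement of the PAGE_PRIVATE_*_FUNC helpers this patch adds to fs/f2fs/f2fs.h; the "_sketch" names are illustrative only): setting a status bit pins the page and tags bit 0, and a flag only counts as set while bit 0 marks page.private as non-pointer data.

	/* Sketch only: the real helpers are generated by the macros in f2fs.h below. */
	static inline void set_page_private_flag_sketch(struct page *page, int flag)
	{
		if (!PagePrivate(page)) {
			get_page(page);		/* flagged page carries a reference */
			SetPagePrivate(page);
		}
		set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page));	/* bit0 = 1 */
		set_bit(flag, &page_private(page));
	}

	static inline bool test_page_private_flag_sketch(struct page *page, int flag)
	{
		return test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) &&
			test_bit(flag, &page_private(page));
	}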
After the change: page.private PG_private PG_checked page._refcount (+1 at most) a) 11 set +1 b) 101 set +1 c) 1001 set +1 d) 10001 set +1 e) set f) 100001 set +1 g) pointer set +1 [1] https://lore.kernel.org/linux-f2fs-devel/20210422154705.GO3596236@casper.infradead.org/T/#u Cc: Matthew Wilcox Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +- fs/f2fs/compress.c | 10 ++-- fs/f2fs/data.c | 65 ++++++++++++---------- fs/f2fs/dir.c | 8 ++- fs/f2fs/f2fs.h | 126 +++++++++++++++++++++++++++++++++++-------- fs/f2fs/gc.c | 6 +-- fs/f2fs/inline.c | 4 +- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 10 ++-- fs/f2fs/node.h | 29 ---------- fs/f2fs/segment.c | 19 +++---- 11 files changed, 175 insertions(+), 108 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4424bedffdc2..3497bac4c53d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -444,7 +444,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - f2fs_set_page_private(page, 0); + set_page_private_reference(page); return 1; } return 0; @@ -1015,7 +1015,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - f2fs_set_page_private(page, 0); + set_page_private_reference(page); } void f2fs_remove_dirty_inode(struct inode *inode) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 0fea2275df19..108e34aa40cd 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -75,7 +75,7 @@ bool f2fs_is_compressed_page(struct page *page) return false; if (!page_private(page)) return false; - if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) + if (page_private_nonpointer(page)) return false; f2fs_bug_on(F2FS_M_SB(page->mapping), @@ -86,8 +86,7 @@ bool f2fs_is_compressed_page(struct page *page) static void f2fs_set_compressed_page(struct page *page, struct inode *inode, pgoff_t index, void *data) { - SetPagePrivate(page); - set_page_private(page, (unsigned long)data); + attach_page_private(page, (void *)data); /* i_crypto_info and iv index */ page->index = index; @@ -560,8 +559,7 @@ static void f2fs_compress_free_page(struct page *page) { if (!page) return; - set_page_private(page, (unsigned long)NULL); - ClearPagePrivate(page); + detach_page_private(page); page->mapping = NULL; unlock_page(page); mempool_free(page, compress_page_pool); @@ -1347,7 +1345,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) for (i = 0; i < cic->nr_rpages; i++) { WARN_ON(!cic->rpages[i]); - clear_cold_data(cic->rpages[i]); + clear_page_private_gcing(cic->rpages[i]); end_page_writeback(cic->rpages[i]); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0e5895c78c83..0ba8dfb09218 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -56,18 +56,19 @@ static bool __is_cp_guaranteed(struct page *page) if (!mapping) return false; - if (f2fs_is_compressed_page(page)) - return false; - inode = mapping->host; sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi) || inode->i_ino == F2FS_NODE_INO(sbi) || - S_ISDIR(inode->i_mode) || - (S_ISREG(inode->i_mode) && + S_ISDIR(inode->i_mode)) + return true; + + if (f2fs_is_compressed_page(page)) + return false; + if ((S_ISREG(inode->i_mode) && (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || - is_cold_data(page)) + page_private_gcing(page)) return true; return false; } @@ -297,9 +298,8 @@ static void f2fs_write_end_io(struct bio *bio) 
struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); - if (IS_DUMMY_WRITTEN_PAGE(page)) { - set_page_private(page, (unsigned long)NULL); - ClearPagePrivate(page); + if (page_private_dummy(page)) { + clear_page_private_dummy(page); unlock_page(page); mempool_free(page, sbi->write_io_dummy); @@ -329,7 +329,7 @@ static void f2fs_write_end_io(struct bio *bio) dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, page)) f2fs_del_fsync_node_entry(sbi, page); - clear_cold_data(page); + clear_page_private_gcing(page); end_page_writeback(page); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && @@ -436,10 +436,11 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); - zero_user_segment(page, 0, PAGE_SIZE); - SetPagePrivate(page); - set_page_private(page, DUMMY_WRITTEN_PAGE); lock_page(page); + + zero_user_segment(page, 0, PAGE_SIZE); + set_page_private_dummy(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) f2fs_bug_on(sbi, 1); } @@ -2470,9 +2471,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) if (f2fs_is_atomic_file(inode)) return true; if (fio) { - if (is_cold_data(fio->page)) + if (page_private_gcing(fio->page)) return true; - if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + if (page_private_dummy(fio->page)) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) @@ -2528,7 +2529,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); goto out_writepage; } got_it: @@ -2738,7 +2739,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, inode_dec_dirty_pages(inode); if (err) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); } if (wbc->for_reclaim) { @@ -3214,7 +3215,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, f2fs_do_read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) - set_inline_node(ipage); + set_page_private_inline(ipage); } else { err = f2fs_convert_inline_page(&dn, page); if (err) @@ -3605,12 +3606,13 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, } } - clear_cold_data(page); + clear_page_private_gcing(page); - if (IS_ATOMIC_WRITTEN_PAGE(page)) + if (page_private_atomic(page)) return f2fs_drop_inmem_page(inode, page); - f2fs_clear_page_private(page); + detach_page_private(page); + set_page_private(page, 0); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -3620,11 +3622,13 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; /* This is atomic written page, keep Private */ - if (IS_ATOMIC_WRITTEN_PAGE(page)) + if (page_private_atomic(page)) return 0; - clear_cold_data(page); - f2fs_clear_page_private(page); + clear_page_private_gcing(page); + + detach_page_private(page); + set_page_private(page, 0); return 1; } @@ -3640,7 +3644,7 @@ static int f2fs_set_data_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!IS_ATOMIC_WRITTEN_PAGE(page)) { + if (!page_private_atomic(page)) { f2fs_register_inmem_page(inode, page); return 1; } @@ -3732,7 +3736,7 @@ int f2fs_migrate_page(struct address_space *mapping, { int rc, extra_count; struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = 
IS_ATOMIC_WRITTEN_PAGE(page); + bool atomic_written = page_private_atomic(page); BUG_ON(PageWriteback(page)); @@ -3768,8 +3772,13 @@ int f2fs_migrate_page(struct address_space *mapping, } if (PagePrivate(page)) { - f2fs_set_page_private(newpage, page_private(page)); - f2fs_clear_page_private(page); + set_page_private(newpage, page_private(page)); + SetPagePrivate(newpage); + get_page(newpage); + + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); } if (mode != MIGRATE_SYNC_NO_COPY) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e412b26bd86b..ae81155d288a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -931,11 +931,15 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); - f2fs_clear_page_private(page); ClearPageUptodate(page); - clear_cold_data(page); + + clear_page_private_gcing(page); + inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); + + detach_page_private(page); + set_page_private(page, 0); } f2fs_put_page(page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5d5cbfd45f10..9b318373f905 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1291,17 +1291,85 @@ enum { */ }; -/* - * this value is set in page as a private data which indicate that - * the page is atomically written, and it is in inmem_pages list. - */ -#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) -#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) +static inline int f2fs_test_bit(unsigned int nr, char *addr); +static inline void f2fs_set_bit(unsigned int nr, char *addr); +static inline void f2fs_clear_bit(unsigned int nr, char *addr); -#define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == ATOMIC_WRITTEN_PAGE) -#define IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == DUMMY_WRITTEN_PAGE) +/* + * Layout of f2fs page.private: + * + * Layout A: lowest bit should be 1 + * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | + * bit 0 PAGE_PRIVATE_NOT_POINTER + * bit 1 PAGE_PRIVATE_ATOMIC_WRITE + * bit 2 PAGE_PRIVATE_DUMMY_WRITE + * bit 3 PAGE_PRIVATE_ONGOING_MIGRATION + * bit 4 PAGE_PRIVATE_INLINE_INODE + * bit 5 PAGE_PRIVATE_REF_RESOURCE + * bit 6- f2fs private data + * + * Layout B: lowest bit should be 0 + * page.private is a wrapped pointer. 
+ */ +enum { + PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ + PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ + PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */ + PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ + PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ + PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */ + PAGE_PRIVATE_MAX +}; + +#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool page_private_##name(struct page *page) \ +{ \ + return test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ + test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void set_page_private_##name(struct page *page) \ +{ \ + if (!PagePrivate(page)) { \ + get_page(page); \ + SetPagePrivate(page); \ + } \ + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ + set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void clear_page_private_##name(struct page *page) \ +{ \ + clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ + if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \ + set_page_private(page, 0); \ + if (PagePrivate(page)) { \ + ClearPagePrivate(page); \ + put_page(page); \ + }\ + } \ +} + +PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); +PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ @@ -3181,25 +3249,41 @@ static inline bool __is_valid_data_blkaddr(block_t blkaddr) return true; } -static inline void f2fs_set_page_private(struct page *page, - unsigned long data) +/** + * attach_page_private - Attach private data to a page. + * @page: Page to attach data to. + * @data: Data to attach to page. + * + * Attaching private data to a page increments the page's reference count. + * The data must be detached before the page will be freed. + */ +static inline void attach_page_private(struct page *page, void *data) { - if (PagePrivate(page)) - return; - get_page(page); + set_page_private(page, (unsigned long)data); SetPagePrivate(page); - set_page_private(page, data); } -static inline void f2fs_clear_page_private(struct page *page) +/** + * detach_page_private - Detach private data from a page. + * @page: Page to detach data from. + * + * Removes the data that was previously attached to the page and decrements + * the refcount on the page. + * + * Return: Data that was attached to the page. 
+ */ +static inline void *detach_page_private(struct page *page) { - if (!PagePrivate(page)) - return; + void *data = (void *)page_private(page); - set_page_private(page, 0); + if (!PagePrivate(page)) + return NULL; ClearPagePrivate(page); - f2fs_put_page(page, 0); + set_page_private(page, 0); + put_page(page); + + return data; } /* diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b40a2da90147..bcb3b488dbca 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1336,7 +1336,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } set_page_dirty(page); - set_cold_data(page); + set_page_private_gcing(page); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1362,11 +1362,11 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, f2fs_remove_dirty_inode(inode); } - set_cold_data(page); + set_page_private_gcing(page); err = f2fs_do_write_data_page(&fio); if (err) { - clear_cold_data(page); + clear_page_private_gcing(page); if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 38011ef34ac0..3308faef0d10 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -172,7 +172,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* clear inline data and flag after data writeback */ f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); - clear_inline_node(dn->inode_page); + clear_page_private_inline(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); clear_inode_flag(dn->inode, FI_INLINE_DATA); @@ -254,7 +254,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); - clear_inline_node(dn.inode_page); + clear_page_private_inline(dn.inode_page); f2fs_put_dnode(&dn); return 0; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 71a354b9fe19..48a448fe700b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -646,7 +646,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) /* deleted inode */ if (inode->i_nlink == 0) - clear_inline_node(node_page); + clear_page_private_inline(node_page); F2FS_I(inode)->i_disk_time[0] = inode->i_atime; F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a06eace2a4c8..b5338f5f596e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1860,8 +1860,8 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) } /* flush inline_data, if it's async context. 
*/ - if (is_inline_node(page)) { - clear_inline_node(page); + if (page_private_inline(page)) { + clear_page_private_inline(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); continue; @@ -1941,8 +1941,8 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, goto write_node; /* flush inline_data */ - if (is_inline_node(page)) { - clear_inline_node(page); + if (page_private_inline(page)) { + clear_page_private_inline(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); goto lock_node; @@ -2096,7 +2096,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - f2fs_set_page_private(page, 0); + set_page_private_reference(page); return 1; } return 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7a45c0f10629..d85e8659cfda 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -389,20 +389,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_cold_data(struct page *page) -{ - return PageChecked(page); -} - -static inline void set_cold_data(struct page *page) -{ - SetPageChecked(page); -} - -static inline void clear_cold_data(struct page *page) -{ - ClearPageChecked(page); -} static inline int is_node(struct page *page, int type) { @@ -414,21 +400,6 @@ static inline int is_node(struct page *page, int type) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) -static inline int is_inline_node(struct page *page) -{ - return PageChecked(page); -} - -static inline void set_inline_node(struct page *page) -{ - SetPageChecked(page); -} - -static inline void clear_inline_node(struct page *page) -{ - ClearPageChecked(page); -} - static inline void set_cold_node(struct page *page, bool is_dir) { struct f2fs_node *rn = F2FS_NODE(page); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d1657c8547bd..b8c41f564e5f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,10 +186,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - if (PagePrivate(page)) - set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - else - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); + set_page_private_atomic(page); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -272,9 +269,10 @@ static int __revoke_inmem_pages(struct inode *inode, /* we don't need to invalidate this in the sccessful status */ if (drop || recover) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); } - f2fs_clear_page_private(page); + detach_page_private(page); + set_page_private(page, 0); f2fs_put_page(page, 1); list_del(&cur->list); @@ -357,7 +355,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) struct list_head *head = &fi->inmem_pages; struct inmem_pages *cur = NULL; - f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + f2fs_bug_on(sbi, !page_private_atomic(page)); mutex_lock(&fi->inmem_lock); list_for_each_entry(cur, head, list) { @@ -373,9 +371,12 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) kmem_cache_free(inmem_entry_slab, cur); ClearPageUptodate(page); - f2fs_clear_page_private(page); + clear_page_private_atomic(page); f2fs_put_page(page, 0); + detach_page_private(page); + set_page_private(page, 0); + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } @@ 
-3288,7 +3289,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page)) { + if (page_private_gcing(fio->page)) { if (fio->sbi->am.atgc_enabled && (fio->io_type == FS_DATA_IO) && (fio->sbi->gc_mode != GC_URGENT_HIGH)) From 1ecac6878fbb66c2ff7c10b316727a1fcfb25503 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 26 May 2021 13:05:36 -0700 Subject: [PATCH 261/580] f2fs: logging neatening Update the logging uses that have unnecessary newlines as the f2fs_printk function and so its f2fs_ macro callers already adds one. This allows searching single line logging entries with an easier grep and also avoids unnecessary blank lines in the logging. Miscellanea: o Coalesce formats o Align to open parenthesis Signed-off-by: Joe Perches Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 16 +++++++--------- fs/f2fs/file.c | 12 +++++------- fs/f2fs/gc.c | 4 ++-- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 4 ++-- 5 files changed, 17 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0ba8dfb09218..1cea40b33dfe 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3823,7 +3823,7 @@ static int f2fs_is_file_aligned(struct inode *inode) /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { - f2fs_err(sbi, "Swapfile has holes\n"); + f2fs_err(sbi, "Swapfile has holes"); ret = -ENOENT; goto out; } @@ -3844,9 +3844,8 @@ static int f2fs_is_file_aligned(struct inode *inode) cur_lblock += nr_pblocks; } if (not_aligned) - f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" - "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", - not_aligned); + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } @@ -3894,7 +3893,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { - f2fs_err(sbi, "Swapfile has holes\n"); + f2fs_err(sbi, "Swapfile has holes"); ret = -EINVAL; goto out; } @@ -3940,9 +3939,8 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->highest_bit = cur_lblock - 1; if (not_aligned) - f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" - "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", - not_aligned); + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } @@ -4050,7 +4048,7 @@ static int check_swap_activate(struct swap_info_struct *sis, out: return ret; bad_bmap: - f2fs_err(sbi, "Swapfile has holes\n"); + f2fs_err(sbi, "Swapfile has holes"); return -EINVAL; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c65185eac992..c57328c4e3f0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3383,7 +3383,7 @@ static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { f2fs_warn(F2FS_I_SB(inode), - "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n", + "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem", inode->i_ino); return -EOPNOTSUPP; } @@ -4164,9 +4164,8 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) LLONG_MAX); if (ret) - f2fs_warn(sbi, "%s: The file might be partially decompressed " - "(errno=%d). 
Please delete the file.\n", - __func__, ret); + f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). Please delete the file.", + __func__, ret); out: inode_unlock(inode); file_end_write(filp); @@ -4238,9 +4237,8 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) clear_inode_flag(inode, FI_ENABLE_COMPRESS); if (ret) - f2fs_warn(sbi, "%s: The file might be partially compressed " - "(errno=%d). Please delete the file.\n", - __func__, ret); + f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). Please delete the file.", + __func__, ret); out: inode_unlock(inode); file_end_write(filp); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bcb3b488dbca..ab1c0123904f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1031,8 +1031,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (unlikely(check_valid_map(sbi, segno, offset))) { if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { - f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n", - blkaddr, source_blkaddr, segno); + f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", + blkaddr, source_blkaddr, segno); f2fs_bug_on(sbi, 1); } } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b8c41f564e5f..debd5a22c59d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3919,7 +3919,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) /* sanity check for summary blocks */ if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { - f2fs_err(sbi, "invalid journal entries nats %u sits %u\n", + f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f1e0deb29d1a..9aa35be4a1e5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1135,7 +1135,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n"); + f2fs_err(sbi, "LFS not compatible with checkpoint=disable"); return -EINVAL; } @@ -3512,7 +3512,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && !f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device feature not enabled\n"); + f2fs_err(sbi, "Zoned block device feature not enabled"); return -EINVAL; } if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { From 6c39030b6f3e7215f7b48e0515583a372e281184 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 May 2021 01:32:53 -0700 Subject: [PATCH 262/580] f2fs: support RO feature Given RO feature in superblock, we don't need to check provisioning/reserve spaces and SSA area. 
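For context, the RO feature is just another superblock feature bit; a minimal sketch of how it is tested (equivalent to the f2fs_sb_has_readonly() helper generated by F2FS_FEATURE_FUNCS(readonly, RO) in the hunk below — the open-coded form here is illustrative only):

	/* Illustrative only: matches __F2FS_HAS_FEATURE() with the new 0x4000 bit. */
	#define F2FS_FEATURE_RO		0x4000

	static inline bool f2fs_raw_sb_has_ro_sketch(struct f2fs_super_block *raw_super)
	{
		/* feature flags are stored little-endian in the on-disk superblock */
		return (raw_super->feature & cpu_to_le32(F2FS_FEATURE_RO)) != 0;
	}

When the bit is set, parse_options() below refuses any read-write mount with -EROFS and default_options() trims the active logs to NR_CURSEG_RO_TYPE.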
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/segment.c | 4 ++++ fs/f2fs/super.c | 37 +++++++++++++++++++++++++++++++------ fs/f2fs/sysfs.c | 8 ++++++++ 4 files changed, 46 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9b318373f905..45456040c32e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -167,6 +167,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_FEATURE_CASEFOLD 0x1000 #define F2FS_FEATURE_COMPRESSION 0x2000 +#define F2FS_FEATURE_RO 0x4000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -940,6 +941,7 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) #define NR_CURSEG_INMEM_TYPE (2) +#define NR_CURSEG_RO_TYPE (2) #define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) #define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) @@ -4166,6 +4168,7 @@ F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); +F2FS_FEATURE_FUNCS(readonly, RO); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index debd5a22c59d..0364f1526617 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4682,6 +4682,10 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; + if (f2fs_sb_has_readonly(sbi) && + i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE) + continue; + sanity_check_seg_type(sbi, curseg->seg_type); if (f2fs_test_bit(blkofs, se->cur_valid_map)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9aa35be4a1e5..35e4d46648dc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -552,7 +552,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) int ret; if (!options) - return 0; + goto default_check; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1072,6 +1072,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } } +default_check: #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; @@ -1144,6 +1145,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) */ if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "Allow to mount readonly mode only"); + return -EROFS; + } return 0; } @@ -1808,7 +1814,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; + if (f2fs_sb_has_readonly(sbi)) + F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE; + else + F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; @@ -1988,6 +1998,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; + if (f2fs_sb_has_readonly(sbi) && !(*flags & SB_RDONLY)) { + err = -EROFS; + goto 
restore_opts; + } + #ifdef CONFIG_QUOTA if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); @@ -3101,14 +3116,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); - if (unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || + if (!f2fs_sb_has_readonly(sbi) && + unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || ovp_segments == 0 || reserved_segments == 0)) { f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version"); return 1; } - user_block_count = le64_to_cpu(ckpt->user_block_count); - segment_count_main = le32_to_cpu(raw_super->segment_count_main); + segment_count_main = le32_to_cpu(raw_super->segment_count_main) + + (f2fs_sb_has_readonly(sbi) ? 1 : 0); log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); if (!user_block_count || user_block_count >= segment_count_main << log_blocks_per_seg) { @@ -3139,6 +3155,10 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto check_data; + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) == le32_to_cpu(ckpt->cur_node_segno[j])) { @@ -3149,10 +3169,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } } } +check_data: for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto skip_cross; + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) == le32_to_cpu(ckpt->cur_data_segno[j])) { @@ -3174,7 +3199,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } } } - +skip_cross: sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b116a127ef91..1196e6be5fcc 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -158,6 +158,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_casefold(sbi)) len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); + if (f2fs_sb_has_readonly(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "readonly"); if (f2fs_sb_has_compression(sbi)) len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "compression"); @@ -581,6 +584,7 @@ enum feat_id { FEAT_SB_CHECKSUM, FEAT_CASEFOLD, FEAT_COMPRESSION, + FEAT_RO, FEAT_TEST_DUMMY_ENCRYPTION_V2, }; @@ -602,6 +606,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_SB_CHECKSUM: case FEAT_CASEFOLD: case FEAT_COMPRESSION: + case FEAT_RO: case FEAT_TEST_DUMMY_ENCRYPTION_V2: return sprintf(buf, "supported\n"); } @@ -724,12 +729,14 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); #endif F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +F2FS_FEATURE_RO_ATTR(readonly, FEAT_RO); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); #endif + /* For ATGC */ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio); F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count); @@ -832,6 +839,7 @@ static struct attribute *f2fs_feat_attrs[] = { #endif ATTR_LIST(sb_checksum), ATTR_LIST(casefold), + ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compression), #endif From e372e38d910ba24c442608174112d0c9b1e6af11 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 3 Jun 2021 09:50:37 +0000 Subject: [PATCH 263/580] f2fs: Show casefolding support only when supported The casefolding feature is only supported when CONFIG_UNICODE is set. This modifies the feature list f2fs presents under sysfs accordingly. Fixes: 5aba54302a46 ("f2fs: include charset encoding information in the superblock") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Daniel Rosenberg Reviewed-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 1196e6be5fcc..9173566a21cb 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -728,7 +728,9 @@ F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); #endif F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); +#ifdef CONFIG_UNICODE F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +#endif F2FS_FEATURE_RO_ATTR(readonly, FEAT_RO); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); @@ -838,7 +840,9 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(sb_checksum), +#ifdef CONFIG_UNICODE ATTR_LIST(casefold), +#endif ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compression), From db637adba8b79984241998e0736923a8d45b4e23 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 3 Jun 2021 09:50:38 +0000 Subject: [PATCH 264/580] f2fs: Advertise encrypted casefolding in sysfs Older kernels don't support encryption with casefolding. This adds the sysfs entry encrypted_casefold to show support for those combined features. 
Support for this feature was originally added by commit 7ad08a58bf67 ("f2fs: Handle casefolding with Encryption") Fixes: 7ad08a58bf67 ("f2fs: Handle casefolding with Encryption") Cc: stable@vger.kernel.org # v5.11+ Signed-off-by: Daniel Rosenberg Reviewed-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 9173566a21cb..81ac539f91ee 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -586,6 +586,7 @@ enum feat_id { FEAT_COMPRESSION, FEAT_RO, FEAT_TEST_DUMMY_ENCRYPTION_V2, + FEAT_ENCRYPTED_CASEFOLD, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -608,6 +609,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_COMPRESSION: case FEAT_RO: case FEAT_TEST_DUMMY_ENCRYPTION_V2: + case FEAT_ENCRYPTED_CASEFOLD: return sprintf(buf, "supported\n"); } return 0; @@ -712,7 +714,10 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2); +#ifdef CONFIG_UNICODE +F2FS_FEATURE_RO_ATTR(encrypted_casefold, FEAT_ENCRYPTED_CASEFOLD); #endif +#endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); #endif @@ -824,7 +829,10 @@ static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), +#ifdef CONFIG_UNICODE + ATTR_LIST(encrypted_casefold), #endif +#endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(block_zoned), #endif From bd2dc7d8e1614229cb7fad08a6dae671adc7f6ea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jun 2021 22:30:09 -0700 Subject: [PATCH 265/580] f2fs: add pin_file in feature list This patch adds missing pin_file feature supported by kernel. Fixes: f5a53edcf01e ("f2fs: support aligned pinned file") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 81ac539f91ee..98aa98ce6689 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -743,6 +743,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); #endif +F2FS_FEATURE_RO_ATTR(pin_file); /* For ATGC */ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio); @@ -855,6 +856,7 @@ static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compression), #endif + ATTR_LIST(pin_file), NULL, }; From d5035ead19d22691e8f933fd0415e626dcb73c1f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jun 2021 12:31:08 -0700 Subject: [PATCH 266/580] f2fs: clean up /sys/fs/f2fs//features Let's create /sys/fs/f2fs//feature_list/ to meet sysfs rule. Note that there are three feature list entries: 1) /sys/fs/f2fs/features : shows runtime features supported by in-kernel f2fs along with Kconfig. - ref. F2FS_FEATURE_RO_ATTR() 2) /sys/fs/f2fs/$s_id/features : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This won't add new feature anymore, and thus, users should check entries in 3) instead of this 2). 3) /sys/fs/f2fs/$s_id/feature_list : shows on-disk features enabled by mkfs.f2fs per instance, which follows sysfs entry rule where each entry should expose single value. 
This list covers old feature list provided by 2) and beyond. Therefore, please add new on-disk feature in this list only. - ref. F2FS_SB_FEATURE_RO_ATTR() Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 29 +++- fs/f2fs/f2fs.h | 3 + fs/f2fs/sysfs.c | 194 ++++++++++++++++-------- 3 files changed, 161 insertions(+), 65 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index d656e4af96c8..4d0e2f8883a7 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -193,7 +193,34 @@ Description: Shows total written kbytes issued to disk. What: /sys/fs/f2fs//features Date: July 2017 Contact: "Jaegeuk Kim" -Description: Shows all enabled features in current device. +Description: /feature_list/ + Shows all enabled features in current device. + Supported features: + encryption, blkzoned, extra_attr, projquota, inode_checksum, + flexible_inline_xattr, quota_ino, inode_crtime, lost_found, + verity, sb_checksum, casefold, readonly, compression, pin_file. + +What: /sys/fs/f2fs//feature_list/ +Date: June 2021 +Contact: "Jaegeuk Kim" +Description: Expand /sys/fs/f2fs//features to meet sysfs rule. + Supported on-disk features: + encryption, block_zoned (aka blkzoned), extra_attr, + project_quota (aka projquota), inode_checksum, + flexible_inline_xattr, quota_ino, inode_crtime, lost_found, + verity, sb_checksum, casefold, readonly, compression. + Note that, pin_file is moved into /sys/fs/f2fs/features/. + +What: /sys/fs/f2fs/features/ +Date: July 2017 +Contact: "Jaegeuk Kim" +Description: Shows all enabled kernel features. + Supported features: + encryption, block_zoned, extra_attr, project_quota, + inode_checksum, flexible_inline_xattr, quota_ino, + inode_crtime, lost_found, verity, sb_checksum, + casefold, readonly, compression, test_dummy_encryption_v2, + atomic_write, pin_file, encrypted_casefold. What: /sys/fs/f2fs//inject_rate Date: May 2016 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 45456040c32e..fdf114bdbb18 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1670,6 +1670,9 @@ struct f2fs_sb_info { struct kobject s_stat_kobj; /* /sys/fs/f2fs//stat */ struct completion s_stat_kobj_unregister; + struct kobject s_feature_list_kobj; /* /sys/fs/f2fs//feature_list */ + struct completion s_feature_list_kobj_unregister; + /* For shrinker support */ struct list_head s_list; int s_ndevs; /* number of devices */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 98aa98ce6689..8e4b8431743a 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -569,50 +569,49 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } -enum feat_id { - FEAT_CRYPTO = 0, - FEAT_BLKZONED, - FEAT_ATOMIC_WRITE, - FEAT_EXTRA_ATTR, - FEAT_PROJECT_QUOTA, - FEAT_INODE_CHECKSUM, - FEAT_FLEXIBLE_INLINE_XATTR, - FEAT_QUOTA_INO, - FEAT_INODE_CRTIME, - FEAT_LOST_FOUND, - FEAT_VERITY, - FEAT_SB_CHECKSUM, - FEAT_CASEFOLD, - FEAT_COMPRESSION, - FEAT_RO, - FEAT_TEST_DUMMY_ENCRYPTION_V2, - FEAT_ENCRYPTED_CASEFOLD, -}; - +/* + * Note that there are three feature list entries: + * 1) /sys/fs/f2fs/features + * : shows runtime features supported by in-kernel f2fs along with Kconfig. + * - ref. F2FS_FEATURE_RO_ATTR() + * + * 2) /sys/fs/f2fs/$s_id/features + * : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This + * won't add new feature anymore, and thus, users should check entries in 3) + * instead of this 2). 
+ * + * 3) /sys/fs/f2fs/$s_id/feature_list + * : shows on-disk features enabled by mkfs.f2fs per instance, which follows + * sysfs entry rule where each entry should expose single value. + * This list covers old feature list provided by 2) and beyond. Therefore, + * please add new on-disk feature in this list only. + * - ref. F2FS_SB_FEATURE_RO_ATTR() + */ static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - switch (a->id) { - case FEAT_CRYPTO: - case FEAT_BLKZONED: - case FEAT_ATOMIC_WRITE: - case FEAT_EXTRA_ATTR: - case FEAT_PROJECT_QUOTA: - case FEAT_INODE_CHECKSUM: - case FEAT_FLEXIBLE_INLINE_XATTR: - case FEAT_QUOTA_INO: - case FEAT_INODE_CRTIME: - case FEAT_LOST_FOUND: - case FEAT_VERITY: - case FEAT_SB_CHECKSUM: - case FEAT_CASEFOLD: - case FEAT_COMPRESSION: - case FEAT_RO: - case FEAT_TEST_DUMMY_ENCRYPTION_V2: - case FEAT_ENCRYPTED_CASEFOLD: + return sprintf(buf, "supported\n"); +} + +#define F2FS_FEATURE_RO_ATTR(_name) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ +} + +static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (F2FS_HAS_FEATURE(sbi, a->id)) return sprintf(buf, "supported\n"); - } - return 0; + return sprintf(buf, "unsupported\n"); +} + +#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ +static struct f2fs_attr f2fs_attr_sb_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_sb_feature_show, \ + .id = F2FS_FEATURE_##_feat, \ } #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ @@ -632,13 +631,6 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) -#define F2FS_FEATURE_RO_ATTR(_name, _id) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = 0444 }, \ - .show = f2fs_feature_show, \ - .id = _id, \ -} - #define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ @@ -712,33 +704,33 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #endif #ifdef CONFIG_FS_ENCRYPTION -F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); -F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2); +F2FS_FEATURE_RO_ATTR(encryption); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); #ifdef CONFIG_UNICODE -F2FS_FEATURE_RO_ATTR(encrypted_casefold, FEAT_ENCRYPTED_CASEFOLD); +F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED -F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +F2FS_FEATURE_RO_ATTR(block_zoned); #endif -F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); -F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); -F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); -F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); -F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); -F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); -F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); -F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); +F2FS_FEATURE_RO_ATTR(atomic_write); +F2FS_FEATURE_RO_ATTR(extra_attr); +F2FS_FEATURE_RO_ATTR(project_quota); +F2FS_FEATURE_RO_ATTR(inode_checksum); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr); +F2FS_FEATURE_RO_ATTR(quota_ino); +F2FS_FEATURE_RO_ATTR(inode_crtime); 
+F2FS_FEATURE_RO_ATTR(lost_found); #ifdef CONFIG_FS_VERITY -F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); +F2FS_FEATURE_RO_ATTR(verity); #endif -F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); +F2FS_FEATURE_RO_ATTR(sb_checksum); #ifdef CONFIG_UNICODE -F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +F2FS_FEATURE_RO_ATTR(casefold); #endif -F2FS_FEATURE_RO_ATTR(readonly, FEAT_RO); +F2FS_FEATURE_RO_ATTR(readonly); #ifdef CONFIG_F2FS_FS_COMPRESSION -F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +F2FS_FEATURE_RO_ATTR(compression); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); @@ -866,6 +858,39 @@ static struct attribute *f2fs_stat_attrs[] = { NULL, }; +F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT); +F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED); +F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR); +F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA); +F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO); +F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME); +F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND); +F2FS_SB_FEATURE_RO_ATTR(verity, VERITY); +F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); +F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); +F2FS_SB_FEATURE_RO_ATTR(readonly, RO); + +static struct attribute *f2fs_sb_feat_attrs[] = { + ATTR_LIST(sb_encryption), + ATTR_LIST(sb_block_zoned), + ATTR_LIST(sb_extra_attr), + ATTR_LIST(sb_project_quota), + ATTR_LIST(sb_inode_checksum), + ATTR_LIST(sb_flexible_inline_xattr), + ATTR_LIST(sb_quota_ino), + ATTR_LIST(sb_inode_crtime), + ATTR_LIST(sb_lost_found), + ATTR_LIST(sb_verity), + ATTR_LIST(sb_sb_checksum), + ATTR_LIST(sb_casefold), + ATTR_LIST(sb_compression), + ATTR_LIST(sb_readonly), + NULL, +}; + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -932,6 +957,33 @@ static struct kobj_type f2fs_stat_ktype = { .release = f2fs_stat_kobj_release, }; +static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? 
a->show(a, sbi, buf) : 0; +} + +static void f2fs_feature_list_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + complete(&sbi->s_feature_list_kobj_unregister); +} + +static const struct sysfs_ops f2fs_feature_list_attr_ops = { + .show = f2fs_sb_feat_attr_show, +}; + +static struct kobj_type f2fs_feature_list_ktype = { + .default_attrs = f2fs_sb_feat_attrs, + .sysfs_ops = &f2fs_feature_list_attr_ops, + .release = f2fs_feature_list_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -1148,6 +1200,14 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) if (err) goto put_stat_kobj; + sbi->s_feature_list_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_feature_list_kobj_unregister); + err = kobject_init_and_add(&sbi->s_feature_list_kobj, + &f2fs_feature_list_ktype, + &sbi->s_kobj, "feature_list"); + if (err) + goto put_feature_list_kobj; + if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -1162,6 +1222,9 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); } return 0; +put_feature_list_kobj: + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); put_stat_kobj: kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); @@ -1184,6 +1247,9 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) kobject_del(&sbi->s_stat_kobj); kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_feature_list_kobj); + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); From 86c0109573768336899c3d1cc9e6cdf68cf262c1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 20 May 2021 19:51:50 +0800 Subject: [PATCH 267/580] f2fs: compress: add compress_inode to cache compressed blocks Support to use address space of inner inode to cache compressed block, in order to improve cache hit ratio of random read. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 3 + fs/f2fs/compress.c | 168 ++++++++++++++++++++++++++++- fs/f2fs/data.c | 41 +++++-- fs/f2fs/debug.c | 13 +++ fs/f2fs/f2fs.h | 71 +++++++++++- fs/f2fs/gc.c | 1 + fs/f2fs/inode.c | 21 +++- fs/f2fs/node.c | 14 +++ fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 6 +- fs/f2fs/super.c | 35 +++++- include/linux/f2fs_fs.h | 1 + 12 files changed, 361 insertions(+), 14 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 992bf91eeec8..809c4d0a696f 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -289,6 +289,9 @@ compress_mode=%s Control file compression mode. This supports "fs" and "user" choosing the target file and the timing. The user can do manual compression/decompression on the compression enabled files using ioctls. +compress_cache Support to use address space of a filesystem managed inode to + cache compressed block, in order to improve cache hit ratio of + random read. inlinecrypt When possible, encrypt/decrypt the contents of encrypted files using the blk-crypto framework rather than filesystem-layer encryption. 
This allows the use of diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 108e34aa40cd..a9fc6b73cf09 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -13,9 +13,11 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" +#include "segment.h" #include static struct kmem_cache *cic_entry_slab; @@ -707,7 +709,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) return ret; } -static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) +void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); struct f2fs_inode_info *fi = F2FS_I(dic->inode); @@ -806,7 +808,8 @@ static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) * page being waited on in the cluster, and if so, it decompresses the cluster * (or in the case of a failure, cleans up without actually decompressing). */ -void f2fs_end_read_compressed_page(struct page *page, bool failed) +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); @@ -816,6 +819,9 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed) if (failed) WRITE_ONCE(dic->failed, true); + else if (blkaddr) + f2fs_cache_compressed_page(sbi, page, + dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) f2fs_decompress_cluster(dic); @@ -1631,6 +1637,164 @@ void f2fs_put_page_dic(struct page *page) f2fs_put_dic(dic); } +const struct address_space_operations f2fs_compress_aops = { + .releasepage = f2fs_release_page, + .invalidatepage = f2fs_invalidate_page, +}; + +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->compress_inode->i_mapping; +} + +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + if (!sbi->compress_inode) + return; + invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr); +} + +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr) +{ + struct page *cpage; + int ret; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + return; + + if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE)) + return; + + cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_put_page(cpage, 0); + return; + } + + cpage = alloc_page(__GFP_NOWARN | __GFP_IO); + if (!cpage) + return; + + ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi), + blkaddr, GFP_NOFS); + if (ret) { + f2fs_put_page(cpage, 0); + return; + } + + set_page_private_data(cpage, ino); + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + goto out; + + memcpy(page_address(cpage), page_address(page), PAGE_SIZE); + SetPageUptodate(cpage); +out: + f2fs_put_page(cpage, 1); +} + +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr) +{ + struct page *cpage; + bool hitted = false; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return false; + + cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi), + blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS); + if (cpage) { + if (PageUptodate(cpage)) { + atomic_inc(&sbi->compress_page_hit); + memcpy(page_address(page), + page_address(cpage), PAGE_SIZE); + hitted = true; + } + f2fs_put_page(cpage, 1); + } + + return hitted; +} + +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct address_space *mapping = 
sbi->compress_inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0; + pgoff_t end = MAX_BLKADDR(sbi); + + if (!mapping->nrpages) + return; + + pagevec_init(&pvec); + + do { + unsigned int nr_pages; + int i; + + nr_pages = pagevec_lookup_range(&pvec, mapping, + &index, end - 1); + if (!nr_pages) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) + break; + + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + + if (ino != get_page_private_data(page)) { + unlock_page(page); + continue; + } + + generic_error_remove_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } while (index < end); +} + +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) +{ + struct inode *inode; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return 0; + + inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi)); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->compress_inode = inode; + + sbi->compress_percent = COMPRESS_PERCENT; + sbi->compress_watermark = COMPRESS_WATERMARK; + + atomic_set(&sbi->compress_page_hit, 0); + + return 0; +} + +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) +{ + if (!sbi->compress_inode) + return; + iput(sbi->compress_inode); + sbi->compress_inode = NULL; +} + int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1cea40b33dfe..eb6e114ce9e2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -130,7 +130,7 @@ static void f2fs_finish_read_bio(struct bio *bio) if (f2fs_is_compressed_page(page)) { if (bio->bi_status) - f2fs_end_read_compressed_page(page, true); + f2fs_end_read_compressed_page(page, true, 0); f2fs_put_page_dic(page); continue; } @@ -226,15 +226,19 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) struct bio_vec *bv; int iter_all; bool all_compressed = true; + block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector); bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; /* PG_error was set if decryption failed. 
*/ if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, PageError(page)); + f2fs_end_read_compressed_page(page, PageError(page), + blkaddr); else all_compressed = false; + + blkaddr++; } /* @@ -1314,9 +1318,11 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + f2fs_invalidate_compress_page(sbi, old_blkaddr); + } f2fs_update_data_blkaddr(dn, dn->data_blkaddr); /* @@ -2135,7 +2141,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - for (i = 0; i < dic->nr_cpages; i++) { + for (i = 0; i < cc->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; struct bio_post_read_ctx *ctx; @@ -2143,6 +2149,14 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (f2fs_load_compressed_page(sbi, page, blkaddr)) { + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic); + continue; + } + if (bio && !page_is_mergeable(sbi, bio, *last_block_in_bio, blkaddr)) { submit_and_realloc: @@ -2163,8 +2177,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, } } - f2fs_wait_on_block_writeback(inode, blkaddr); - if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; @@ -3608,6 +3620,13 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_page_private_gcing(page); + if (test_opt(sbi, COMPRESS_CACHE)) { + if (f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); + } + if (page_private_atomic(page)) return f2fs_drop_inmem_page(inode, page); @@ -3625,6 +3644,16 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (page_private_atomic(page)) return 0; + if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct inode *inode = page->mapping->host; + + if (f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); + } + clear_page_private_gcing(page); detach_page_private(page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c03949a7ccff..833325038ef3 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -152,6 +152,12 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->node_pages = NODE_MAPPING(sbi)->nrpages; if (sbi->meta_inode) si->meta_pages = META_MAPPING(sbi)->nrpages; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages; + si->compress_page_hit = atomic_read(&sbi->compress_page_hit); + } +#endif si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; si->sits = MAIN_SEGS(sbi); @@ -309,6 +315,12 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } +#endif } static int 
stat_show(struct seq_file *s, void *v) @@ -476,6 +488,7 @@ static int stat_show(struct seq_file *s, void *v) "volatile IO: %4d (Max. %4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt, si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fdf114bdbb18..979fa9bc1e59 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 +#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -1380,6 +1381,37 @@ PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif +static inline unsigned long get_page_private_data(struct page *page) +{ + unsigned long data = page_private(page); + + if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) + return 0; + return data >> PAGE_PRIVATE_MAX; +} + +static inline void set_page_private_data(struct page *page, unsigned long data) +{ + if (!PagePrivate(page)) { + get_page(page); + SetPagePrivate(page); + } + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); + page_private(page) |= data << PAGE_PRIVATE_MAX; +} + +static inline void clear_page_private_data(struct page *page) +{ + page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1; + if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { + set_page_private(page, 0); + if (PagePrivate(page)) { + ClearPagePrivate(page); + put_page(page); + } + } +} + /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, @@ -1393,6 +1425,9 @@ enum compress_flag { COMPRESS_MAX_FLAG, }; +#define COMPRESS_WATERMARK 20 +#define COMPRESS_PERCENT 20 + #define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ @@ -1705,6 +1740,12 @@ struct f2fs_sb_info { u64 compr_written_block; u64 compr_saved_block; u32 compr_new_inode; + + /* For compressed block cache */ + struct inode *compress_inode; /* cache compressed blocks */ + unsigned int compress_percent; /* cache page percentage */ + unsigned int compress_watermark; /* cache page watermark */ + atomic_t compress_page_hit; /* cache hit count */ #endif }; @@ -3709,7 +3750,8 @@ struct f2fs_stat_info { unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; - int dirty_count, node_pages, meta_pages; + int dirty_count, node_pages, meta_pages, compress_pages; + int compress_page_hit; int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; @@ -4045,7 +4087,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_end_read_compressed_page(struct page *page, bool failed); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic); +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); void f2fs_compress_ctx_add_page(struct compress_ctx 
*cc, struct page *page); @@ -4063,10 +4107,19 @@ void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi); +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr); +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr); +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); #define inc_compr_inode_stat(inode) \ do { \ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ @@ -4095,7 +4148,9 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } -static inline void f2fs_end_read_compressed_page(struct page *page, bool failed) +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { } +static inline void f2fs_end_read_compressed_page(struct page *page, + bool failed, block_t blkaddr) { WARN_ON_ONCE(1); } @@ -4103,10 +4158,20 @@ static inline void f2fs_put_page_dic(struct page *page) { WARN_ON_ONCE(1); } +static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } +static inline void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, + block_t blkaddr) { } +static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, nid_t ino, block_t blkaddr) { } +static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, block_t blkaddr) { return false; } +static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, + nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) #endif diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ab1c0123904f..da5947b30142 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1261,6 +1261,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_put_page(mpage, 1); invalidate_mapping_pages(META_MAPPING(fio.sbi), fio.old_blkaddr, fio.old_blkaddr); + f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr); set_page_dirty(fio.encrypted_page); if (clear_page_dirty_for_io(fio.encrypted_page)) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 48a448fe700b..9e32c338052d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -18,6 +18,10 @@ #include +#ifdef CONFIG_F2FS_FS_COMPRESSION +extern const struct address_space_operations f2fs_compress_aops; +#endif + void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { if (is_inode_flag_set(inode, FI_NEW_INODE)) @@ -494,6 +498,11 @@ struct inode 
*f2fs_iget(struct super_block *sb, unsigned long ino) if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) goto make_now; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (ino == F2FS_COMPRESS_INO(sbi)) + goto make_now; +#endif + ret = do_read_inode(inode); if (ret) goto bad_inode; @@ -504,6 +513,12 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) } else if (ino == F2FS_META_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_meta_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (ino == F2FS_COMPRESS_INO(sbi)) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + inode->i_mapping->a_ops = &f2fs_compress_aops; +#endif + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); } else if (S_ISREG(inode->i_mode)) { inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; @@ -723,8 +738,12 @@ void f2fs_evict_inode(struct inode *inode) trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); + if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_NODE_INO(sbi) || - inode->i_ino == F2FS_META_INO(sbi)) + inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_COMPRESS_INO(sbi)) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b5338f5f596e..4a7cd4b780eb 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -97,6 +97,20 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; res = mem_size < (avail_ram * nm_i->ram_thresh / 100); + } else if (type == COMPRESS_PAGE) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned long free_ram = val.freeram; + + /* + * free memory is lower than watermark or cached page count + * exceed threshold, deny caching compress page. 
+ */ + res = (free_ram > avail_ram * sbi->compress_watermark / 100) && + (COMPRESS_MAPPING(sbi)->nrpages < + free_ram * sbi->compress_percent / 100); +#else + res = false; +#endif } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index d85e8659cfda..84d45385d1f2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -148,6 +148,7 @@ enum mem_type { EXTENT_CACHE, /* indicates extent cache */ INMEM_PAGES, /* indicates inmemory pages */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ + COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0364f1526617..7687ebd7976f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2321,6 +2321,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) return; invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); + f2fs_invalidate_compress_page(sbi, addr); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); @@ -3468,9 +3469,11 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio); - if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(fio->sbi), fio->old_blkaddr, fio->old_blkaddr); + f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr); + } /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -3660,6 +3663,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + f2fs_invalidate_compress_page(sbi, old_blkaddr); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 35e4d46648dc..3983b8b8d4a4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -148,6 +148,7 @@ enum { Opt_compress_extension, Opt_compress_chksum, Opt_compress_mode, + Opt_compress_cache, Opt_atgc, Opt_gc_merge, Opt_nogc_merge, @@ -221,6 +222,7 @@ static match_table_t f2fs_tokens = { {Opt_compress_extension, "compress_extension=%s"}, {Opt_compress_chksum, "compress_chksum"}, {Opt_compress_mode, "compress_mode=%s"}, + {Opt_compress_cache, "compress_cache"}, {Opt_atgc, "atgc"}, {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, @@ -1048,12 +1050,16 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_compress_cache: + set_opt(sbi, COMPRESS_CACHE); + break; #else case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: case Opt_compress_chksum: case Opt_compress_mode: + case Opt_compress_cache: f2fs_info(sbi, "compression options not supported"); break; #endif @@ -1403,6 +1409,8 @@ static void f2fs_put_super(struct super_block *sb) f2fs_bug_on(sbi, sbi->fsync_node_num); + f2fs_destroy_compress_inode(sbi); + iput(sbi->node_inode); sbi->node_inode = NULL; @@ -1670,6 +1678,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, seq_printf(seq, ",compress_mode=%s", "fs"); else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) seq_printf(seq, ",compress_mode=%s", "user"); + + if (test_opt(sbi, COMPRESS_CACHE)) + seq_puts(seq, ",compress_cache"); } #endif @@ -1943,6 +1954,7 @@ 
static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); + bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; @@ -2040,6 +2052,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch compress_cache option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -3922,10 +3940,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_node_inode; } - err = f2fs_register_sysfs(sbi); + err = f2fs_init_compress_inode(sbi); if (err) goto free_root_inode; + err = f2fs_register_sysfs(sbi); + if (err) + goto free_compress_inode; + #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { @@ -4055,6 +4077,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* evict some inodes being cached by GC */ evict_inodes(sb); f2fs_unregister_sysfs(sbi); +free_compress_inode: + f2fs_destroy_compress_inode(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -4133,6 +4157,15 @@ static void kill_f2fs_super(struct super_block *sb) f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * latter evict_inode() can bypass checking and invalidating + * compress inode cache. + */ + if (test_opt(sbi, COMPRESS_CACHE)) + truncate_inode_pages_final(COMPRESS_MAPPING(sbi)); +#endif + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index f93000c3a127..d445150c5350 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -34,6 +34,7 @@ #define F2FS_ROOT_INO(sbi) ((sbi)->root_ino_num) #define F2FS_NODE_INO(sbi) ((sbi)->node_ino_num) #define F2FS_META_INO(sbi) ((sbi)->meta_ino_num) +#define F2FS_COMPRESS_INO(sbi) (NM_I(sbi)->max_nid) #define F2FS_MAX_QUOTAS 3 From 17eeacbe1420332d16ffbc9e66063625388f3163 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 May 2021 14:29:26 +0800 Subject: [PATCH 268/580] f2fs: swap: remove dead codes After commit af4b6b8edf6a ("f2fs: introduce check_swap_activate_fast()"), we will never run into original logic of check_swap_activate() before f2fs supports non 4k-sized page, so let's delete those dead codes. 
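Concretely, the removed check_swap_activate() only fell through to the legacy bmap()-based scan (and to f2fs_is_file_aligned()) when PAGE_SIZE != F2FS_BLKSIZE; since f2fs only supports 4KB-sized pages/blocks today, that branch can never be taken, and check_swap_activate_fast() simply takes over the check_swap_activate() name.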
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 168 +------------------------------------------------ 1 file changed, 1 insertion(+), 167 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index eb6e114ce9e2..763c6f06990e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3820,66 +3820,7 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP -static int f2fs_is_file_aligned(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - block_t main_blkaddr = SM_I(sbi)->main_blkaddr; - block_t cur_lblock; - block_t last_lblock; - block_t pblock; - unsigned long nr_pblocks; - unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); - unsigned int not_aligned = 0; - int ret = 0; - - cur_lblock = 0; - last_lblock = bytes_to_blks(inode, i_size_read(inode)); - - while (cur_lblock < last_lblock) { - struct f2fs_map_blocks map; - - memset(&map, 0, sizeof(map)); - map.m_lblk = cur_lblock; - map.m_len = last_lblock - cur_lblock; - map.m_next_pgofs = NULL; - map.m_next_extent = NULL; - map.m_seg_type = NO_CHECK_TYPE; - map.m_may_create = false; - - ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); - if (ret) - goto out; - - /* hole */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) { - f2fs_err(sbi, "Swapfile has holes"); - ret = -ENOENT; - goto out; - } - - pblock = map.m_pblk; - nr_pblocks = map.m_len; - - if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || - nr_pblocks & (blocks_per_sec - 1)) { - if (f2fs_is_pinned_file(inode)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; - } - not_aligned++; - } - - cur_lblock += nr_pblocks; - } - if (not_aligned) - f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", - not_aligned); -out: - return ret; -} - -static int check_swap_activate_fast(struct swap_info_struct *sis, +static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; @@ -3974,113 +3915,6 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, return ret; } -/* Copied from generic_swapfile_activate() to check any holes */ -static int check_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) -{ - struct address_space *mapping = swap_file->f_mapping; - struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned blocks_per_page; - unsigned long page_no; - sector_t probe_block; - sector_t last_block; - sector_t lowest_block = -1; - sector_t highest_block = 0; - int nr_extents = 0; - int ret = 0; - - if (PAGE_SIZE == F2FS_BLKSIZE) - return check_swap_activate_fast(sis, swap_file, span); - - ret = f2fs_is_file_aligned(inode); - if (ret) - goto out; - - blocks_per_page = bytes_to_blks(inode, PAGE_SIZE); - - /* - * Map all the blocks into the extent list. This code doesn't try - * to be very smart. 
- */ - probe_block = 0; - page_no = 0; - last_block = bytes_to_blks(inode, i_size_read(inode)); - while ((probe_block + blocks_per_page) <= last_block && - page_no < sis->max) { - unsigned block_in_page; - sector_t first_block; - sector_t block = 0; - - cond_resched(); - - block = probe_block; - ret = bmap(inode, &block); - if (ret) - goto out; - if (!block) - goto bad_bmap; - first_block = block; - - /* - * It must be PAGE_SIZE aligned on-disk - */ - if (first_block & (blocks_per_page - 1)) { - probe_block++; - goto reprobe; - } - - for (block_in_page = 1; block_in_page < blocks_per_page; - block_in_page++) { - - block = probe_block + block_in_page; - ret = bmap(inode, &block); - if (ret) - goto out; - if (!block) - goto bad_bmap; - - if (block != first_block + block_in_page) { - /* Discontiguity */ - probe_block++; - goto reprobe; - } - } - - first_block >>= (PAGE_SHIFT - inode->i_blkbits); - if (page_no) { /* exclude the header page */ - if (first_block < lowest_block) - lowest_block = first_block; - if (first_block > highest_block) - highest_block = first_block; - } - - /* - * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks - */ - ret = add_swap_extent(sis, page_no, 1, first_block); - if (ret < 0) - goto out; - nr_extents += ret; - page_no++; - probe_block += blocks_per_page; -reprobe: - continue; - } - ret = nr_extents; - *span = 1 + highest_block - lowest_block; - if (page_no == 0) - page_no = 1; /* force Empty message */ - sis->max = page_no; - sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; -out: - return ret; -bad_bmap: - f2fs_err(sbi, "Swapfile has holes"); - return -EINVAL; -} - static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { From d542430eb9080a908107a4d5a116819785179db3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 May 2021 14:29:27 +0800 Subject: [PATCH 269/580] f2fs: swap: support migrating swapfile in aligned write mode This patch supports to migrate swapfile in aligned write mode during swapon in order to keep swapfile being aligned to section as much as possible, then pinned swapfile will locates fully filled section which may not affected by GC. However, for the case that swapfile's size is not aligned to section size, it will still leave last extent in file's tail as unaligned due to its size is smaller than section size, like case #2. 
case #1 xfs_io -f /mnt/f2fs/file -c "pwrite 0 4M" -c "fsync" Before swapon: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..3047]: 1123352..1126399 3048 0x1000 1: [3048..7143]: 237568..241663 4096 0x1000 2: [7144..8191]: 245760..246807 1048 0x1001 After swapon: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..8191]: 249856..258047 8192 0x1001 Kmsg: F2FS-fs (zram0): Swapfile (2) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(2097152 * n) case #2 xfs_io -f /mnt/f2fs/file -c "pwrite 0 3M" -c "fsync" Before swapon: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..3047]: 246808..249855 3048 0x1000 1: [3048..6143]: 237568..240663 3096 0x1001 After swapon: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 258048..262143 4096 0x1000 1: [4096..6143]: 238616..240663 2048 0x1001 Kmsg: F2FS-fs (zram0): Swapfile: last extent is not aligned to section F2FS-fs (zram0): Swapfile (2) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(2097152 * n) Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 108 ++++++++++++++++++++++++++++++++++++++++------ fs/f2fs/f2fs.h | 1 + fs/f2fs/node.h | 3 ++ fs/f2fs/segment.c | 3 ++ 4 files changed, 101 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 763c6f06990e..e34b6275ac17 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2460,6 +2460,10 @@ static inline bool check_inplace_update_policy(struct inode *inode, bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return false; + if (f2fs_is_pinned_file(inode)) return true; @@ -2482,6 +2486,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; + + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return true; + if (fio) { if (page_private_gcing(fio->page)) return true; @@ -3820,6 +3829,65 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP +static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, + unsigned int blkcnt) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int blkofs; + unsigned int blk_per_sec = BLKS_PER_SEC(sbi); + unsigned int secidx = start_blk / blk_per_sec; + unsigned int end_sec = secidx + blkcnt / blk_per_sec; + int ret = 0; + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + set_inode_flag(inode, FI_ALIGNED_WRITE); + + for (; secidx < end_sec; secidx++) { + down_write(&sbi->pin_sem); + + f2fs_lock_op(sbi); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); + f2fs_unlock_op(sbi); + + set_inode_flag(inode, FI_DO_DEFRAG); + + for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { + struct page *page; + unsigned int blkidx = secidx * blk_per_sec + blkofs; + + page = f2fs_get_lock_data_page(inode, blkidx, true); + if (IS_ERR(page)) { + up_write(&sbi->pin_sem); + ret = PTR_ERR(page); + goto done; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + } + + clear_inode_flag(inode, FI_DO_DEFRAG); + + ret = filemap_fdatawrite(inode->i_mapping); + + up_write(&sbi->pin_sem); + + if (ret) + break; + } + +done: + clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_ALIGNED_WRITE); + + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + return ret; +} + static 
int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { @@ -3833,7 +3901,8 @@ static int check_swap_activate(struct swap_info_struct *sis, sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; - unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int blks_per_sec = BLKS_PER_SEC(sbi); + unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1; unsigned int not_aligned = 0; int ret = 0; @@ -3846,7 +3915,7 @@ static int check_swap_activate(struct swap_info_struct *sis, while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; - +retry: cond_resched(); memset(&map, 0, sizeof(map)); @@ -3871,16 +3940,28 @@ static int check_swap_activate(struct swap_info_struct *sis, pblock = map.m_pblk; nr_pblocks = map.m_len; - if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || - nr_pblocks & (blocks_per_sec - 1)) { - if (f2fs_is_pinned_file(inode)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; - } + if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask || + nr_pblocks & sec_blks_mask) { not_aligned++; - } + nr_pblocks = roundup(nr_pblocks, blks_per_sec); + if (cur_lblock + nr_pblocks > sis->max) + nr_pblocks -= blks_per_sec; + + if (!nr_pblocks) { + /* this extent is last one */ + nr_pblocks = map.m_len; + f2fs_warn(sbi, "Swapfile: last extent is not aligned to section"); + goto next; + } + + ret = f2fs_migrate_blocks(inode, cur_lblock, + nr_pblocks); + if (ret) + goto out; + goto retry; + } +next: if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; @@ -3907,11 +3988,10 @@ static int check_swap_activate(struct swap_info_struct *sis, sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; - - if (not_aligned) - f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", - not_aligned); out: + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)", + not_aligned, blks_per_sec * F2FS_BLKSIZE); return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 979fa9bc1e59..f54389b3377a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -708,6 +708,7 @@ enum { FI_MMAP_FILE, /* indicate file was mmapped */ FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ FI_COMPRESS_RELEASED, /* compressed blocks were released */ + FI_ALIGNED_WRITE, /* enable aligned write */ FI_MAX, /* max flag, never be used */ }; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 84d45385d1f2..ff14a6e5ac1c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -38,6 +38,9 @@ /* return value for read_node_page */ #define LOCKED_PAGE 1 +/* check pinned file's alignment status of physical blocks */ +#define FILE_NOT_ALIGNED 1 + /* For flag in struct node_info */ enum { IS_CHECKPOINTED, /* is it checkpointed before? 
*/ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7687ebd7976f..c961aed4aa1b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3290,6 +3290,9 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return CURSEG_COLD_DATA_PINNED; + if (page_private_gcing(fio->page)) { if (fio->sbi->am.atgc_enabled && (fio->io_type == FS_DATA_IO) && From a64a156a46fd9b0ff4187289fec5ff9a7cb59ce0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 11 Jun 2021 07:46:30 +0800 Subject: [PATCH 270/580] f2fs: introduce f2fs_casefolded_name slab cache Add a slab cache: "f2fs_casefolded_name" for memory allocation of casefold name. Reviewed-by: Eric Biggers Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 17 +++++++++++------ fs/f2fs/recovery.c | 6 +++++- fs/f2fs/super.c | 24 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index ae81155d288a..9ceb1b7c39c6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,6 +16,10 @@ #include "xattr.h" #include +#ifdef CONFIG_UNICODE +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + static unsigned long dir_blocks(struct inode *inode) { return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) @@ -77,11 +81,10 @@ int f2fs_init_casefolded_name(const struct inode *dir, { #ifdef CONFIG_UNICODE struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); if (IS_CASEFOLDED(dir)) { - fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, - GFP_NOFS); + fname->cf_name.name = kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS); if (!fname->cf_name.name) return -ENOMEM; fname->cf_name.len = utf8_casefold(sb->s_encoding, @@ -89,7 +92,7 @@ int f2fs_init_casefolded_name(const struct inode *dir, fname->cf_name.name, F2FS_NAME_LEN); if ((int)fname->cf_name.len <= 0) { - kfree(fname->cf_name.name); + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; if (sb_has_strict_encoding(sb)) return -EINVAL; @@ -172,8 +175,10 @@ void f2fs_free_filename(struct f2fs_filename *fname) fname->crypto_buf.name = NULL; #endif #ifdef CONFIG_UNICODE - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; + if (fname->cf_name.name) { + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); + fname->cf_name.name = NULL; + } #endif } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d4259868c191..35034cfcf638 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -45,6 +45,10 @@ static struct kmem_cache *fsync_entry_slab; +#ifdef CONFIG_UNICODE +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); @@ -145,7 +149,7 @@ static int init_recovered_filename(const struct inode *dir, f2fs_hash_filename(dir, fname); #ifdef CONFIG_UNICODE /* Case-sensitive match is fine for recovery */ - kfree(fname->cf_name.name); + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; #endif } else { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3983b8b8d4a4..5071f5c7eca2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -274,6 +274,24 @@ static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb, return 0; } + +struct kmem_cache *f2fs_cf_name_slab; +static int __init f2fs_create_casefold_cache(void) +{ + f2fs_cf_name_slab = 
f2fs_kmem_cache_create("f2fs_casefolded_name", + F2FS_NAME_LEN); + if (!f2fs_cf_name_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_casefold_cache(void) +{ + kmem_cache_destroy(f2fs_cf_name_slab); +} +#else +static int __init f2fs_create_casefold_cache(void) { return 0; } +static void f2fs_destroy_casefold_cache(void) { } #endif static inline void limit_reserve_root(struct f2fs_sb_info *sbi) @@ -4265,7 +4283,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_compress_cache(); if (err) goto free_compress_mempool; + err = f2fs_create_casefold_cache(); + if (err) + goto free_compress_cache; return 0; +free_compress_cache: + f2fs_destroy_compress_cache(); free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: @@ -4301,6 +4324,7 @@ static int __init init_f2fs_fs(void) static void __exit exit_f2fs_fs(void) { + f2fs_destroy_casefold_cache(); f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); From 609a7a7ca2324af137b354c9c17c1604a690adf2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Jun 2021 07:31:22 +0800 Subject: [PATCH 271/580] f2fs: fix to avoid adding tab before doc section Otherwise whole section after tab will be invisible in compiled html format document. Cc: Mauro Carvalho Chehab Fixes: 89272ca1102e ("docs: filesystems: convert f2fs.txt to ReST") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 809c4d0a696f..b91e5a8444d5 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -720,10 +720,10 @@ users. ===================== ======================== =================== User F2FS Block ===================== ======================== =================== - META WRITE_LIFE_NOT_SET - HOT_NODE " - WARM_NODE " - COLD_NODE " +N/A META WRITE_LIFE_NOT_SET +N/A HOT_NODE " +N/A WARM_NODE " +N/A COLD_NODE " ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME extension list " " @@ -749,10 +749,10 @@ WRITE_LIFE_LONG " WRITE_LIFE_LONG ===================== ======================== =================== User F2FS Block ===================== ======================== =================== - META WRITE_LIFE_MEDIUM; - HOT_NODE WRITE_LIFE_NOT_SET - WARM_NODE " - COLD_NODE WRITE_LIFE_NONE +N/A META WRITE_LIFE_MEDIUM; +N/A HOT_NODE WRITE_LIFE_NOT_SET +N/A WARM_NODE " +N/A COLD_NODE WRITE_LIFE_NONE ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME extension list " " From 6d76a545816bd3c39aa9c8f759b4c6d761d78458 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 15 Jun 2021 15:39:04 -0700 Subject: [PATCH 272/580] f2fs: enable extent cache for compression files in read-only Let's allow extent cache for RO partition. 
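On a read-only image the block mapping of a compressed file cannot change, so the extent cache can safely serve those inodes as well. To allow this, f2fs_may_extent_tree() moves below the F2FS_FEATURE_FUNCS() definitions (so it can call f2fs_sb_has_readonly()) and now rejects compressed inodes only when the image is not read-only.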
Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f54389b3377a..cabc95557241 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3154,25 +3154,6 @@ static inline bool is_dot_dotdot(const u8 *name, size_t len) return false; } -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. - */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { @@ -4239,6 +4220,26 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); +static inline bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT) || + (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi))) + return false; + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&sbi->s_list)) + return false; + + return S_ISREG(inode->i_mode); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) From 3265f956e353ed6e1c27c137d279e06c3bf6e125 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Jun 2021 12:56:44 -0700 Subject: [PATCH 273/580] f2fs: remove false alarm on iget failure during GC This patch removes setting SBI_NEED_FSCK when GC gets an error on f2fs_iget, since f2fs_iget can give ENOMEM and others by race condition. If we set this critical fsck flag, we'll get EIO during fsync via the below code path. In f2fs_inplace_write_data(), if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) { err = -EIO; goto drop_bio; } Fixes: 9557727876674 ("f2fs: drop inplace IO if fs status is abnormal") Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index da5947b30142..0e42ee5f7770 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1451,10 +1451,8 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); + if (IS_ERR(inode) || is_bad_inode(inode)) continue; - } if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { From 38682b399a061832018c0605f3fce668bb151e5d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Jun 2021 01:43:24 -0700 Subject: [PATCH 274/580] Revert "f2fs: avoid attaching SB_ACTIVE flag during mount/remount" This reverts commit d84ac50992774952fd56bb9ae3a16e554a171465. 
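The restored hunks set SB_ACTIVE before orphan-inode recovery, fsync-data recovery and checkpoint disabling; per the re-added comments, iput() needs it to work correctly and not trash data.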
--- fs/f2fs/checkpoint.c | 3 +++ fs/f2fs/recovery.c | 8 ++++++-- fs/f2fs/super.c | 11 +++++++---- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3497bac4c53d..23d36de40317 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -691,6 +691,9 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) } #ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= SB_ACTIVE; + /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 35034cfcf638..e39371760be8 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -785,6 +785,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } #ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif @@ -812,8 +814,10 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); - else - f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); + else { + /* restore s_flags to let iput() trash data */ + sbi->sb->s_flags = s_flags; + } skip: destroy_fsync_dnodes(&inode_list, err); destroy_fsync_dnodes(&tmp_inode_list, err); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5071f5c7eca2..102ddd0a5bac 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1892,15 +1892,17 @@ static int f2fs_enable_quotas(struct super_block *sb); static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { + unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; int err = 0; int ret; block_t unusable; - if (sbi->sb->s_flags & SB_RDONLY) { + if (s_flags & SB_RDONLY) { f2fs_err(sbi, "checkpoint=disable on readonly fs"); return -EINVAL; } + sbi->sb->s_flags |= SB_ACTIVE; f2fs_update_time(sbi, DISABLE_TIME); @@ -1918,13 +1920,13 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) ret = sync_filesystem(sbi->sb); if (ret || err) { err = ret ? ret : err; - goto out; + goto restore_flag; } unusable = f2fs_get_unusable_blocks(sbi); if (f2fs_disable_cp_again(sbi, unusable)) { err = -EAGAIN; - goto out; + goto restore_flag; } down_write(&sbi->gc_lock); @@ -1940,7 +1942,8 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) out_unlock: up_write(&sbi->gc_lock); -out: +restore_flag: + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } From cba5450eb5fdebe8bf1d5a7121f58580c174b619 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Tue, 8 Jun 2021 19:15:08 +0800 Subject: [PATCH 275/580] f2fs: compress: add nocompress extensions support When we create a directory with enable compression, all file write into directory will try to compress.But sometimes we may know, new file cannot meet compression ratio requirements. We need a nocompress extension to skip those files to avoid unnecessary compress page test. After add nocompress_extension, the priority should be: dir_flag < comp_extention,nocompress_extension < comp_file_flag, no_comp_file_flag. 
Priority in between FS_COMPR_FL, FS_NOCOMP_FS, extensions: * compress_extension=so; nocompress_extension=zip; chattr +c dir; touch dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so and baz.txt should be compresse, bar.zip should be non-compressed. chattr +c dir/bar.zip can enable compress on bar.zip. * compress_extension=so; nocompress_extension=zip; chattr -c dir; touch dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so should be compresse, bar.zip and baz.txt should be non-compressed. chattr+c dir/bar.zip; chattr+c dir/baz.txt; can enable compress on bar.zip and baz.txt. Signed-off-by: Fengnan Chang Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 31 +++++++++++- fs/f2fs/f2fs.h | 2 + fs/f2fs/namei.c | 20 ++++++-- fs/f2fs/super.c | 79 +++++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index b91e5a8444d5..ff9e7cc97c65 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -281,6 +281,18 @@ compress_extension=%s Support adding specified extension, so that f2fs can enab For other files, we can still enable compression via ioctl. Note that, there is one reserved special extension '*', it can be set to enable compression for all files. +nocompress_extension=%s Support adding specified extension, so that f2fs can disable + compression on those corresponding files, just contrary to compression extension. + If you know exactly which files cannot be compressed, you can use this. + The same extension name can't appear in both compress and nocompress + extension at the same time. + If the compress extension specifies all files, the types specified by the + nocompress extension will be treated as special cases and will not be compressed. + Don't allow use '*' to specifie all file in nocompress extension. + After add nocompress_extension, the priority should be: + dir_flag < comp_extention,nocompress_extension < comp_file_flag,no_comp_file_flag. + See more in compression sections. + compress_chksum Support verifying chksum of raw data in compressed cluster. compress_mode=%s Control file compression mode. This supports "fs" and "user" modes. In "fs" mode (default), f2fs does automatic compression @@ -817,13 +829,30 @@ Compression implementation all logical blocks in cluster contain valid data and compress ratio of cluster data is lower than specified threshold. -- To enable compression on regular inode, there are three ways: +- To enable compression on regular inode, there are four ways: * chattr +c file * chattr +c dir; touch dir/file * mount w/ -o compress_extension=ext; touch file.ext * mount w/ -o compress_extension=*; touch any_file +- To disable compression on regular inode, there are two ways: + + * chattr -c file + * mount w/ -o nocompress_extension=ext; touch file.ext + +- Priority in between FS_COMPR_FL, FS_NOCOMP_FS, extensions: + + * compress_extension=so; nocompress_extension=zip; chattr +c dir; touch + dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so and baz.txt + should be compresse, bar.zip should be non-compressed. chattr +c dir/bar.zip + can enable compress on bar.zip. + * compress_extension=so; nocompress_extension=zip; chattr -c dir; touch + dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so should be + compresse, bar.zip and baz.txt should be non-compressed. + chattr+c dir/bar.zip; chattr+c dir/baz.txt; can enable compress on bar.zip + and baz.txt. 
+ - At this point, compression feature doesn't expose compressed space to user directly in order to guarantee potential data updates later to the space. Instead, the main goal is to reduce data writes to flash disk as much as diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cabc95557241..c1ff975ec7a7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -150,8 +150,10 @@ struct f2fs_mount_info { unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ + unsigned char nocompress_ext_cnt; /* nocompress extension count */ int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index dc782dc4d6da..a21144415604 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -282,14 +282,16 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*ext)[F2FS_EXTENSION_LEN]; - unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; int i, cold_count, hot_count; if (!f2fs_sb_has_compression(sbi) || - is_inode_flag_set(inode, FI_COMPRESSED_FILE) || F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode)) + !f2fs_may_compress(inode) || + (!ext_cnt && !noext_cnt)) return; down_read(&sbi->sb_lock); @@ -306,7 +308,15 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, up_read(&sbi->sb_lock); - ext = F2FS_OPTION(sbi).extensions; + for (i = 0; i < noext_cnt; i++) { + if (is_extension_exist(name, noext[i], false)) { + f2fs_disable_compressed_file(inode); + return; + } + } + + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return; for (i = 0; i < ext_cnt; i++) { if (!is_extension_exist(name, ext[i], false)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 102ddd0a5bac..194a03b35a2b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -146,6 +146,7 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_nocompress_extension, Opt_compress_chksum, Opt_compress_mode, Opt_compress_cache, @@ -220,6 +221,7 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_nocompress_extension, "nocompress_extension=%s"}, {Opt_compress_chksum, "compress_chksum"}, {Opt_compress_mode, "compress_mode=%s"}, {Opt_compress_cache, "compress_cache"}, @@ -490,6 +492,43 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, } #ifdef CONFIG_F2FS_FS_COMPRESSION +/* + * 1. The same extension name cannot not appear in both compress and non-compress extension + * at the same time. + * 2. If the compress extension specifies all files, the types specified by the non-compress + * extension will be treated as special cases and will not be compressed. + * 3. Don't allow the non-compress extension specifies all files. 
+ */ +static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) +{ + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt, index = 0, no_index = 0; + + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (!noext_cnt) + return 0; + + for (no_index = 0; no_index < noext_cnt; no_index++) { + if (!strcasecmp("*", noext[no_index])) { + f2fs_info(sbi, "Don't allow the nocompress extension specifies all files"); + return -EINVAL; + } + for (index = 0; index < ext_cnt; index++) { + if (!strcasecmp(ext[index], noext[no_index])) { + f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension", + ext[index]); + return -EINVAL; + } + } + } + return 0; +} + #ifdef CONFIG_F2FS_FS_LZ4 static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) { @@ -563,7 +602,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) substring_t args[MAX_OPT_ARGS]; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; - int ext_cnt; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt; #endif char *p, *name; int arg = 0; @@ -1051,6 +1091,30 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; + case Opt_nocompress_extension: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + noext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(sbi, + "invalid extension length/number"); + kfree(name); + return -EINVAL; + } + + strcpy(noext[noext_cnt], name); + F2FS_OPTION(sbi).nocompress_ext_cnt++; + kfree(name); + break; case Opt_compress_chksum: F2FS_OPTION(sbi).compress_chksum = true; break; @@ -1075,6 +1139,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: + case Opt_nocompress_extension: case Opt_compress_chksum: case Opt_compress_mode: case Opt_compress_cache: @@ -1129,6 +1194,13 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } #endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_test_compress_extension(sbi)) { + f2fs_err(sbi, "invalid compress or nocompress extension"); + return -EINVAL; + } +#endif + if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", F2FS_IO_SIZE_KB(sbi)); @@ -1689,6 +1761,11 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, F2FS_OPTION(sbi).extensions[i]); } + for (i = 0; i < F2FS_OPTION(sbi).nocompress_ext_cnt; i++) { + seq_printf(seq, ",nocompress_extension=%s", + F2FS_OPTION(sbi).noextensions[i]); + } + if (F2FS_OPTION(sbi).compress_chksum) seq_puts(seq, ",compress_chksum"); From b72ae94e88f3e1248d9e8c3218fff76de854585b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 4 Jul 2021 22:11:25 -0700 Subject: [PATCH 276/580] f2fs: initialize page->private when using for our internal use We need to guarantee it's initially zero. Otherwise, it'll hurt entire flag operations. 
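The failure mode is easy to reproduce outside the kernel, since page->private doubles as a small flag word (PAGE_PRIVATE_NOT_POINTER and friends, with a data payload shifted above PAGE_PRIVATE_MAX): any leftover contents look like flags that were never set. A stand-alone sketch with simplified bit names, not the kernel enum:

#include <stdio.h>

enum { NOT_POINTER = 0, ONGOING_MIGRATION = 1 };

int main(void)
{
	unsigned long stale = 0xdeadbeefUL;	/* leftover pointer-ish value */
	unsigned long clean = 0;		/* what set_page_private(page, 0) gives us */

	stale |= 1UL << NOT_POINTER;		/* first flag set without zeroing */
	clean |= 1UL << NOT_POINTER;

	printf("stale migration bit: %lu\n", (stale >> ONGOING_MIGRATION) & 1);	/* bogus 1 */
	printf("clean migration bit: %lu\n", (clean >> ONGOING_MIGRATION) & 1);	/* 0 */
	return 0;
}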
Fixes: b763f3bedc2d ("f2fs: restructure f2fs page.private layout") Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ fs/f2fs/f2fs.h | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e34b6275ac17..e0e058a90976 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3809,6 +3809,8 @@ int f2fs_migrate_page(struct address_space *mapping, get_page(newpage); } + /* guarantee to start from no stale private field */ + set_page_private(newpage, 0); if (PagePrivate(page)) { set_page_private(newpage, page_private(page)); SetPagePrivate(newpage); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c1ff975ec7a7..291164f693ee 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1330,7 +1330,8 @@ enum { #define PAGE_PRIVATE_GET_FUNC(name, flagname) \ static inline bool page_private_##name(struct page *page) \ { \ - return test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ + return PagePrivate(page) && \ + test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ } @@ -1340,6 +1341,7 @@ static inline void set_page_private_##name(struct page *page) \ if (!PagePrivate(page)) { \ get_page(page); \ SetPagePrivate(page); \ + set_page_private(page, 0); \ } \ set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ @@ -1398,6 +1400,7 @@ static inline void set_page_private_data(struct page *page, unsigned long data) if (!PagePrivate(page)) { get_page(page); SetPagePrivate(page); + set_page_private(page, 0); } set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); page_private(page) |= data << PAGE_PRIVATE_MAX; From 341b0015809777212de330e4802864e0cec28bfe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 6 Jul 2021 22:05:06 -0700 Subject: [PATCH 277/580] f2fs: drop dirty node pages when cp is in error status Otherwise, writeback is going to fall in a loop to flush dirty inode forever before getting SBI_CLOSING. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4a7cd4b780eb..a67c301f2f49 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1549,13 +1549,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); if (unlikely(f2fs_cp_error(sbi))) { - if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - unlock_page(page); - return 0; - } - goto redirty_out; + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) From 00908eecaff0279d8178d953fed807f0aa5b0a67 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 9 Jul 2021 22:53:57 -0700 Subject: [PATCH 278/580] f2fs: add sysfs nodes to get GC info for each GC mode Added gc_reclaimed_segments and gc_segment_mode sysfs nodes. 1) "gc_reclaimed_segments" shows how many segments have been reclaimed by GC during a specific GC mode. 2) "gc_segment_mode" is used to control for which gc mode the "gc_reclaimed_segments" node shows. 
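From user space this is a plain sysfs write followed by a read; a minimal sketch (the /sys/fs/f2fs/sda directory is only an example, substitute the real device, and the mode numbering is the one documented in the ABI entry below):

#include <stdio.h>

int main(void)
{
	const char *base = "/sys/fs/f2fs/sda";
	char path[256], buf[32];
	FILE *f;

	/* select which mode's counter gc_reclaimed_segments reports */
	snprintf(path, sizeof(path), "%s/gc_segment_mode", base);
	f = fopen(path, "w");
	if (!f)
		return 1;
	fputs("4\n", f);			/* 4: GC urgent high */
	fclose(f);

	snprintf(path, sizeof(path), "%s/gc_reclaimed_segments", base);
	f = fopen(path, "r");
	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("segments reclaimed while in urgent-high GC: %s", buf);
	fclose(f);
	return 0;
}

Writing "0" to gc_reclaimed_segments resets the counter for the currently selected mode, as the sysfs.c hunk below enforces.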
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 14 +++++++++++++ fs/f2fs/debug.c | 9 ++++++++ fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/gc.c | 1 + fs/f2fs/sysfs.c | 28 +++++++++++++++++++++++++ 5 files changed, 57 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 4d0e2f8883a7..d3e441415ed0 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -473,3 +473,17 @@ Contact: "Chao Yu" Description: When ATGC is on, it controls age threshold to bypass GCing young candidates whose age is not beyond the threshold, by default it was initialized as 604800 seconds (equals to 7 days). + +What: /sys/fs/f2fs//gc_reclaimed_segments +Date: July 2021 +Contact: "Daeho Jeong" +Description: Show how many segments have been reclaimed by GC during a specific + GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy, + 3: GC idle AT, 4: GC urgent high, 5: GC urgent low) + You can re-initialize this value to "0". + +What: /sys/fs/f2fs//gc_segment_mode +Date: July 2021 +Contact: "Daeho Jeong" +Description: You can control for which gc mode the "gc_reclaimed_segments" node shows. + Refer to the description of the modes in "gc_reclaimed_segments". diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 833325038ef3..53ed1e9191f0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -450,6 +450,15 @@ static int stat_show(struct seq_file *s, void *v) si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); + seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " + "Idle Greedy (%d), Idle AT (%d), " + "Urgent High (%d), Urgent Low (%d)\n", + si->sbi->gc_reclaimed_segs[GC_NORMAL], + si->sbi->gc_reclaimed_segs[GC_IDLE_CB], + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], + si->sbi->gc_reclaimed_segs[GC_IDLE_AT], + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], + si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 291164f693ee..c6b4f0221015 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1252,6 +1252,7 @@ enum { GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, + MAX_GC_MODE, }; enum { @@ -1738,6 +1739,10 @@ struct f2fs_sb_info { struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ + /* For reclaimed segs statistics per each GC mode */ + unsigned int gc_segment_mode; /* GC state for reclaimed segments */ + unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0e42ee5f7770..d9511827dc83 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1646,6 +1646,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, force_migrate); stat_inc_seg_count(sbi, type, gc_type); + sbi->gc_reclaimed_segs[sbi->gc_mode]++; migrated++; freed: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8e4b8431743a..6aa72dae94df 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -310,6 +310,14 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, sbi->compr_new_inode); #endif + if (!strcmp(a->attr.name, 
"gc_segment_mode")) + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->gc_segment_mode); + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + return snprintf(buf, PAGE_SIZE, "%u\n", + sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); + } + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -518,6 +526,21 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "gc_segment_mode")) { + if (t < MAX_GC_MODE) + sbi->gc_segment_mode = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + if (t != 0) + return -EINVAL; + sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0; + return count; + } + *ui = (unsigned int)t; return count; @@ -743,6 +766,9 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_cou F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -815,6 +841,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(atgc_candidate_count), ATTR_LIST(atgc_age_weight), ATTR_LIST(atgc_age_threshold), + ATTR_LIST(gc_segment_mode), + ATTR_LIST(gc_reclaimed_segments), NULL, }; From 5120acdc90922beba60dd1e40ff578da2695f8a3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 10 Jul 2021 08:21:41 +0800 Subject: [PATCH 279/580] f2fs: compress: fix to set zstd compress level correctly As 5kft reported in [1]: set_compress_context() should set compress level into .i_compress_flag for zstd as well as lz4hc, otherwise, zstd compressor will still use default zstd compress level during compression, fix it. [1] https://lore.kernel.org/linux-f2fs-devel/8e29f52b-6b0d-45ec-9520-e63eb254287a@www.fastmail.com/T/#u Fixes: 3fde13f817e2 ("f2fs: compress: support compress level") Reported-by: 5kft <5kft@5kft.org> Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c6b4f0221015..47a185dbbeb4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4180,7 +4180,8 @@ static inline void set_compress_context(struct inode *inode) 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; - if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 && + if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || + F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) F2FS_I(inode)->i_compress_flag |= F2FS_OPTION(sbi).compress_level << From c80651d0d3f99ad4c1513ae14833de70ef986332 Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Fri, 9 Jul 2021 16:34:53 +0800 Subject: [PATCH 280/580] f2fs: avoid to create an empty string as the extension_list When creating a file, we need to set the temperature based on extension_list. If the empty string is a valid extension_list, the is_extension_exist will always returns true, which affects the separation of hot and cold. 
Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6aa72dae94df..61d67bdd35ce 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -354,7 +354,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, set = false; } - if (strlen(name) >= F2FS_EXTENSION_LEN) + if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) return -EINVAL; down_write(&sbi->sb_lock); From b431fca2c4db4a69feb4845b9d182fa53f6c3e10 Mon Sep 17 00:00:00 2001 From: Jia Yang Date: Wed, 14 Jul 2021 15:46:06 +0800 Subject: [PATCH 281/580] f2fs: Revert "f2fs: Fix indefinite loop in f2fs_gc() v1" This reverts commit 957fa47823dfe449c5a15a944e4e7a299a6601db. The patch "f2fs: Fix indefinite loop in f2fs_gc()" v1 and v4 are all merged. Patch v4 is test info for patch v1. Patch v1 doesn't work and may cause that sbi->cur_victim_sec can't be resetted to NULL_SEGNO, which makes SSR unable to get segment of sbi->cur_victim_sec. So it should be reverted. The mails record: [1] https://lore.kernel.org/linux-f2fs-devel/7288dcd4-b168-7656-d1af-7e2cafa4f720@huawei.com/T/ [2] https://lore.kernel.org/linux-f2fs-devel/20190809153653.GD93481@jaegeuk-macbookpro.roam.corp.google.com/T/ Signed-off-by: Jia Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d9511827dc83..9dce44619069 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1748,7 +1748,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, round++; } - if (gc_type == FG_GC && seg_freed) + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (sync) From 073a1afb6bb799c6c7b98d30785a22b34bdd7c6f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 14 Jul 2021 16:14:02 -0700 Subject: [PATCH 282/580] f2fs: let's keep writing IOs on SBI_NEED_FSCK SBI_NEED_FSCK is an indicator that fsck.f2fs needs to be triggered, so it is not fully critical to stop any IO writes. So, let's allow to write data instead of reporting EIO forever given SBI_NEED_FSCK, but do keep OPU. 
Fixes: 955772787667 ("f2fs: drop inplace IO if fs status is abnormal") Cc: # v5.13+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ fs/f2fs/segment.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e0e058a90976..e6731354e78d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2486,6 +2486,8 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c961aed4aa1b..ad214ee72d76 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3562,7 +3562,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) goto drop_bio; } - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) { + if (f2fs_cp_error(sbi)) { err = -EIO; goto drop_bio; } From b24f79b8e65d025b8733972c6266d442907af6ac Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 19 Jul 2021 16:46:47 +0800 Subject: [PATCH 283/580] f2fs: quota: fix potential deadlock xfstest generic/587 reports a deadlock issue as below: ====================================================== WARNING: possible circular locking dependency detected 5.14.0-rc1 #69 Not tainted ------------------------------------------------------ repquota/8606 is trying to acquire lock: ffff888022ac9320 (&sb->s_type->i_mutex_key#18){+.+.}-{3:3}, at: f2fs_quota_sync+0x207/0x300 [f2fs] but task is already holding lock: ffff8880084bcde8 (&sbi->quota_sem){.+.+}-{3:3}, at: f2fs_quota_sync+0x59/0x300 [f2fs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&sbi->quota_sem){.+.+}-{3:3}: __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_read+0x3b/0x2a0 f2fs_quota_sync+0x59/0x300 [f2fs] f2fs_quota_on+0x48/0x100 [f2fs] do_quotactl+0x5e3/0xb30 __x64_sys_quotactl+0x23a/0x4e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #1 (&sbi->cp_rwsem){++++}-{3:3}: __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_read+0x3b/0x2a0 f2fs_unlink+0x353/0x670 [f2fs] vfs_unlink+0x1c7/0x380 do_unlinkat+0x413/0x4b0 __x64_sys_unlinkat+0x50/0xb0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #0 (&sb->s_type->i_mutex_key#18){+.+.}-{3:3}: check_prev_add+0xdc/0xb30 validate_chain+0xa67/0xb20 __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_write+0x39/0xc0 f2fs_quota_sync+0x207/0x300 [f2fs] do_quotactl+0xaff/0xb30 __x64_sys_quotactl+0x23a/0x4e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae other info that might help us debug this: Chain exists of: &sb->s_type->i_mutex_key#18 --> &sbi->cp_rwsem --> &sbi->quota_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sbi->quota_sem); lock(&sbi->cp_rwsem); lock(&sbi->quota_sem); lock(&sb->s_type->i_mutex_key#18); *** DEADLOCK *** 3 locks held by repquota/8606: #0: ffff88801efac0e0 (&type->s_umount_key#53){++++}-{3:3}, at: user_get_super+0xd9/0x190 #1: ffff8880084bc380 (&sbi->cp_rwsem){++++}-{3:3}, at: f2fs_quota_sync+0x3e/0x300 [f2fs] #2: ffff8880084bcde8 (&sbi->quota_sem){.+.+}-{3:3}, at: f2fs_quota_sync+0x59/0x300 [f2fs] stack backtrace: CPU: 6 PID: 8606 Comm: repquota Not tainted 5.14.0-rc1 #69 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 Call Trace: dump_stack_lvl+0xce/0x134 dump_stack+0x17/0x20 
print_circular_bug.isra.0.cold+0x239/0x253 check_noncircular+0x1be/0x1f0 check_prev_add+0xdc/0xb30 validate_chain+0xa67/0xb20 __lock_acquire+0x648/0x10b0 lock_acquire+0x128/0x470 down_write+0x39/0xc0 f2fs_quota_sync+0x207/0x300 [f2fs] do_quotactl+0xaff/0xb30 __x64_sys_quotactl+0x23a/0x4e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f883b0b4efe The root cause is ABBA deadlock of inode lock and cp_rwsem, reorder locks in f2fs_quota_sync() as below to fix this issue: - lock inode - lock cp_rwsem - lock quota_sem Fixes: db6ec53b7e03 ("f2fs: add a rw_sem to cover quota flag changes") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 88 ++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 194a03b35a2b..64cd473cb00d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2505,6 +2505,33 @@ static int f2fs_enable_quotas(struct super_block *sb) return 0; } +static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type) +{ + struct quota_info *dqopt = sb_dqopt(sbi->sb); + struct address_space *mapping = dqopt->files[type]->i_mapping; + int ret = 0; + + ret = dquot_writeback_dquots(sbi->sb, type); + if (ret) + goto out; + + ret = filemap_fdatawrite(mapping); + if (ret) + goto out; + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + goto out; + + ret = filemap_fdatawait(mapping); + + truncate_inode_pages(&dqopt->files[type]->i_data, 0); +out: + if (ret) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + int f2fs_quota_sync(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -2512,57 +2539,42 @@ int f2fs_quota_sync(struct super_block *sb, int type) int cnt; int ret; - /* - * do_quotactl - * f2fs_quota_sync - * down_read(quota_sem) - * dquot_writeback_dquots() - * f2fs_dquot_commit - * block_operation - * down_read(quota_sem) - */ - f2fs_lock_op(sbi); - - down_read(&sbi->quota_sem); - ret = dquot_writeback_dquots(sb, type); - if (ret) - goto out; - /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. 
*/ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - struct address_space *mapping; if (type != -1 && cnt != type) continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - mapping = dqopt->files[cnt]->i_mapping; - - ret = filemap_fdatawrite(mapping); - if (ret) - goto out; - - /* if we are using journalled quota */ - if (is_journalled_quota(sbi)) - continue; - - ret = filemap_fdatawait(mapping); - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + if (!sb_has_quota_active(sb, type)) + return 0; inode_lock(dqopt->files[cnt]); - truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + + /* + * do_quotactl + * f2fs_quota_sync + * down_read(quota_sem) + * dquot_writeback_dquots() + * f2fs_dquot_commit + * block_operation + * down_read(quota_sem) + */ + f2fs_lock_op(sbi); + down_read(&sbi->quota_sem); + + ret = f2fs_quota_sync_file(sbi, cnt); + + up_read(&sbi->quota_sem); + f2fs_unlock_op(sbi); + inode_unlock(dqopt->files[cnt]); + + if (ret) + break; } -out: - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); - f2fs_unlock_op(sbi); return ret; } From bdef9d8852b1d7d6da6e2b6195cf0f5083208e15 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:11 -0500 Subject: [PATCH 284/580] f2fs: make f2fs_write_failed() take struct inode Make f2fs_write_failed() take a 'struct inode' directly rather than a 'struct address_space', as this simplifies it slightly. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e6731354e78d..05747d3ac667 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3168,9 +3168,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, FS_CP_DATA_IO : FS_DATA_IO); } -static void f2fs_write_failed(struct address_space *mapping, loff_t to) +static void f2fs_write_failed(struct inode *inode, loff_t to) { - struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); if (IS_NOQUOTA(inode)) @@ -3402,7 +3401,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, fail: f2fs_put_page(page, 1); - f2fs_write_failed(mapping, pos + len); + f2fs_write_failed(inode, pos + len); if (drop_atomic) f2fs_drop_inmem_pages_all(sbi, false); return err; @@ -3592,7 +3591,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, count - iov_iter_count(iter)); } else if (err < 0) { - f2fs_write_failed(mapping, offset + count); + f2fs_write_failed(inode, offset + count); } } else { if (err > 0) From 657bd8f81db97a8bf51c2a3516dff4158fcd65f4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:12 -0500 Subject: [PATCH 285/580] f2fs: remove allow_outplace_dio() We can just check f2fs_lfs_mode() directly. The block_unaligned_IO() check is redundant because in LFS mode, f2fs doesn't do direct I/O writes that aren't block-aligned (due to f2fs_force_buffered_io() returning true in this case, triggering the fallback to buffered I/O). 
Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 10 ---------- fs/f2fs/file.c | 2 +- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 05747d3ac667..dd2bce6ef8a7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3543,7 +3543,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (f2fs_force_buffered_io(inode, iocb, iter)) return 0; - do_opu = allow_outplace_dio(inode, iocb, iter); + do_opu = rw == WRITE && f2fs_lfs_mode(sbi); trace_f2fs_direct_IO_enter(inode, offset, count, rw); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 47a185dbbeb4..5721e20353e3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4365,16 +4365,6 @@ static inline int block_unaligned_IO(struct inode *inode, return align & blocksize_mask; } -static inline int allow_outplace_dio(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int rw = iov_iter_rw(iter); - - return (f2fs_lfs_mode(sbi) && (rw == WRITE) && - !block_unaligned_IO(inode, iocb, iter)); -} - static inline bool f2fs_force_buffered_io(struct inode *inode, struct kiocb *iocb, struct iov_iter *iter) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c57328c4e3f0..349a5e19172e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4445,7 +4445,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * back to buffered IO. */ if (!f2fs_force_buffered_io(inode, iocb, from) && - allow_outplace_dio(inode, iocb, from)) + f2fs_lfs_mode(F2FS_I_SB(inode))) goto write; } preallocated = true; From 5f81864accc4c1f0a70f178abb06b35333c2a794 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 22 Jul 2021 10:30:58 -0700 Subject: [PATCH 286/580] f2fs: don't sleep while grabing nat_tree_lock This tries to fix priority inversion in the below condition resulting in long checkpoint delay. f2fs_get_node_info() - nat_tree_lock -> sleep to grab journal_rwsem by contention checkpoint - waiting for nat_tree_lock In order to let checkpoint go, let's release nat_tree_lock, if there's a journal_rwsem contention. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a67c301f2f49..e8fa6877af5a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -552,7 +552,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, int i; ni->nid = nid; - +retry: /* Check nat cache */ down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); @@ -564,10 +564,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, return 0; } - memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + /* + * Check current segment summary by trying to grab journal_rwsem first. + * This sem is on the critical path on the checkpoint requiring the above + * nat_tree_lock. Therefore, we should retry, if we failed to grab here + * while not bothering checkpoint. 
+ */ + if (!rwsem_is_locked(&sbi->cp_global_sem)) { + down_read(&curseg->journal_rwsem); + } else if (!down_read_trylock(&curseg->journal_rwsem)) { + up_read(&nm_i->nat_tree_lock); + goto retry; + } - /* Check current segment summary */ - down_read(&curseg->journal_rwsem); i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); From b72eb13a3d8e71ac45559cefe8ab46a38b7eef60 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Tue, 22 Jun 2021 19:50:59 +0800 Subject: [PATCH 287/580] f2fs: compress: remove unneeded read when rewrite whole cluster when we overwrite the whole page in cluster, we don't need read original data before write, because after write_end(), writepages() can help to load left data in that cluster. Signed-off-by: Fengnan Chang Signed-off-by: Chao Yu Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dd2bce6ef8a7..cdccef54c7ee 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3319,6 +3319,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *fsdata = NULL; + if (len == PAGE_SIZE) + goto repeat; + ret = f2fs_prepare_compress_overwrite(inode, pagep, index, fsdata); if (ret < 0) { From 23b3ad006c95ce7c907a3ca24d3dd57f414eaa0a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 26 Jul 2021 09:12:15 -0700 Subject: [PATCH 288/580] f2fs: do not submit NEW_ADDR to read node block After the below patch, give cp is errored, we drop dirty node pages. This can give NEW_ADDR to read node pages. Don't do WARN_ON() which gives generic/475 failure. Fixes: 28607bf3aa6f ("f2fs: drop dirty node pages when cp is in error status") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e8fa6877af5a..fae025d0392d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1328,7 +1328,8 @@ static int read_node_page(struct page *page, int op_flags) if (err) return err; - if (unlikely(ni.blk_addr == NULL_ADDR) || + /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { ClearPageUptodate(page); return -ENOENT; From a1f2bc24415170d3bcdda727a0bb583e8a9851ee Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Sun, 25 Jul 2021 21:18:19 -0700 Subject: [PATCH 289/580] f2fs: change fiemap way in printing compression chunk When we print out a discontinuous compression chunk, it shows like a continuous chunk now. To show it more correctly, I've changed the way of printing fiemap info like below. Plus, eliminated NEW_ADDR(-1) in fiemap info, since it is not in fiemap user api manual. Let's assume 16KB compression cluster. Logical Physical Length Flags 0: 0000000000000000 00000002c091f000 0000000000004000 1008 1: 0000000000004000 00000002c0920000 0000000000004000 1008 ... 9: 0000000000034000 0000000f8c623000 0000000000004000 1008 10: 0000000000038000 000000101a6eb000 0000000000004000 1008 0: 0000000000000000 00000002c091f000 0000000000004000 1008 1: 0000000000004000 00000002c0920000 0000000000004000 1008 ... 
9: 0000000000034000 0000000f8c623000 0000000000001000 1008 10: 0000000000035000 000000101a6ea000 0000000000003000 1008 11: 0000000000038000 000000101a6eb000 0000000000002000 1008 12: 000000000003a000 00000002c3544000 0000000000002000 1008 Flags 0x1000 => FIEMAP_EXTENT_MERGED 0x0008 => FIEMAP_EXTENT_ENCODED Signed-off-by: Daeho Jeong Tested-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 75 ++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdccef54c7ee..03090e634a76 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1805,8 +1805,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; - bool compr_cluster = false; + bool compr_cluster = false, compr_appended; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int count_in_cluster = 0; loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { @@ -1854,15 +1855,17 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; - if (compr_cluster) - map.m_len = cluster_size - 1; + if (compr_cluster) { + map.m_lblk += 1; + map.m_len = cluster_size - count_in_cluster; + } ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) { + if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, @@ -1872,6 +1875,14 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_LAST; } + compr_appended = false; + /* In a case of compressed cluster, append this to the last extent */ + if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) || + !(map.m_flags & F2FS_MAP_FLAGS))) { + compr_appended = true; + goto skip_fill; + } + if (size) { flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) @@ -1888,39 +1899,37 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (start_blk > last_blk) goto out; - if (compr_cluster) { - compr_cluster = false; - - - logical = blks_to_bytes(inode, start_blk - 1); - phys = blks_to_bytes(inode, map.m_pblk); - size = blks_to_bytes(inode, cluster_size); - - flags |= FIEMAP_EXTENT_ENCODED; - - start_blk += cluster_size - 1; - - if (start_blk > last_blk) - goto out; - - goto prep_next; - } - +skip_fill: if (map.m_pblk == COMPRESS_ADDR) { compr_cluster = true; - start_blk++; - goto prep_next; + count_in_cluster = 1; + } else if (compr_appended) { + unsigned int appended_blks = cluster_size - + count_in_cluster + 1; + size += blks_to_bytes(inode, appended_blks); + start_blk += appended_blks; + compr_cluster = false; + } else { + logical = blks_to_bytes(inode, start_blk); + phys = __is_valid_data_blkaddr(map.m_pblk) ? 
+ blks_to_bytes(inode, map.m_pblk) : 0; + size = blks_to_bytes(inode, map.m_len); + flags = 0; + + if (compr_cluster) { + flags = FIEMAP_EXTENT_ENCODED; + count_in_cluster += map.m_len; + if (count_in_cluster == cluster_size) { + compr_cluster = false; + size += blks_to_bytes(inode, 1); + } + } else if (map.m_flags & F2FS_MAP_UNWRITTEN) { + flags = FIEMAP_EXTENT_UNWRITTEN; + } + + start_blk += bytes_to_blks(inode, size); } - logical = blks_to_bytes(inode, start_blk); - phys = blks_to_bytes(inode, map.m_pblk); - size = blks_to_bytes(inode, map.m_len); - flags = 0; - if (map.m_flags & F2FS_MAP_UNWRITTEN) - flags = FIEMAP_EXTENT_UNWRITTEN; - - start_blk += bytes_to_blks(inode, size); - prep_next: cond_resched(); if (fatal_signal_pending(current)) From 7a6c4b08699594ca22eedbe42c11fe58e49d490c Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 28 Jul 2021 12:38:11 -0700 Subject: [PATCH 290/580] f2fs: turn back remapped address in compressed page endio Turned back the remmaped sector address to the address in the partition, when ending io, for compress cache to work properly. Fixes: 6ce19aff0b8c ("f2fs: compress: add compress_inode to cache compressed blocks") Signed-off-by: Daeho Jeong Signed-off-by: Youngjin Gil Signed-off-by: Hyeong Jun Kim Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 03090e634a76..34ada94f5701 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -114,6 +114,7 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + block_t fs_blkaddr; }; static void f2fs_finish_read_bio(struct bio *bio) @@ -226,7 +227,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) struct bio_vec *bv; int iter_all; bool all_compressed = true; - block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector); + block_t blkaddr = ctx->fs_blkaddr; bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; @@ -965,6 +966,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->bio = bio; ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; + ctx->fs_blkaddr = blkaddr; bio->bi_private = ctx; } From 3cf880c946c48f2e3bfe8d7f0bc2e68393663863 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 30 Jul 2021 08:10:48 -0700 Subject: [PATCH 291/580] f2fs: show sbi status in debugfs/f2fs/status We need to get sbi->s_flag to understand the current f2fs status as well. One example is SBI_NEED_FSCK. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 53ed1e9191f0..473ad04d1891 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -333,11 +333,12 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s (sbi: 0x%lx)]=====\n", si->sbi->sb->s_bdev, i++, f2fs_readonly(si->sbi->sb) ? "RO": "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? - "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good")); + "Disabled": (f2fs_cp_error(si->sbi) ? 
"Error": "Good"), + si->sbi->s_flag); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", From fc709bdf1ba0149f7b17ac31536841065e4fa7ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 29 Jul 2021 09:22:17 +0800 Subject: [PATCH 292/580] f2fs: fix wrong checkpoint_changed value in f2fs_remount() In f2fs_remount(), return value of test_opt() is an unsigned int type variable, however when we compare it to a bool type variable, it cause wrong result, fix it. Fixes: 4354994f097d ("f2fs: checkpoint disabling") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 64cd473cb00d..7759f61a628c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2049,11 +2049,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); + bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); - bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2098,8 +2097,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = parse_options(sb, data, true); if (err) goto restore_opts; - checkpoint_changed = - disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT); /* * Previous and new state of filesystem is RO, @@ -2221,7 +2218,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_flush = true; } - if (checkpoint_changed) { + if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) From e295ca68c872ca8068f05c4de4a07b4d064b0263 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 20 Jul 2021 09:03:29 +0800 Subject: [PATCH 293/580] f2fs: fix to force keeping write barrier for strict fsync mode [1] https://www.mail-archive.com/linux-f2fs-devel@lists.sourceforge.net/msg15126.html As [1] reported, if lower device doesn't support write barrier, in below case: - write page #0; persist - overwrite page #0 - fsync - write data page #0 OPU into device's cache - write inode page into device's cache - issue flush If SPO is triggered during flush command, inode page can be persisted before data page #0, so that after recovery, inode page can be recovered with new physical block address of data page #0, however there may contains dummy data in new physical block address. Then what user will see is: after overwrite & fsync + SPO, old data in file was corrupted, if any user do care about such case, we can suggest user to use STRICT fsync mode, in this mode, we will force to use atomic write sematics to keep write order in between data/node and last node, so that it avoids potential data corruption during fsync(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 349a5e19172e..91ebf8fc712d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -300,6 +300,18 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, f2fs_exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; + } else { + /* + * for OPU case, during fsync(), node can be persisted before + * data when lower device doesn't support write barrier, result + * in data corruption after SPO. + * So for strict fsync mode, force to use atomic write sematics + * to keep write order in between data/node and last node to + * avoid potential data corruption. + */ + if (F2FS_OPTION(sbi).fsync_mode == + FSYNC_MODE_STRICT && !atomic) + atomic = true; } go_write: /* From 9986d88f9831476d26e230ac941b7c989512771a Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Sat, 31 Jul 2021 11:26:46 +0800 Subject: [PATCH 294/580] f2fs: fix min_seq_blocks can not make sense in some scenes. F2FS have dirty page count control for batched sequential write in writepages, and get the value of min_seq_blocks by blocks_per_seg * segs_per_sec(segs_per_sec defaults to 1). But in some scenes we set a lager section size, Min_seq_blocks will become too large to achieve the expected effect(eg. 4thread sequential write, the number of merge requests will be reduced). Signed-off-by: Laibin Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ad214ee72d76..e7ca7e96beb1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4803,7 +4803,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec; + sm_info->min_seq_blocks = sbi->blocks_per_seg; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); From ee208707f69c27699e3a20c2a92447a8772a2f51 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 3 Aug 2021 08:15:43 +0800 Subject: [PATCH 295/580] f2fs: introduce discard_unit mount option As James Z reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=213877 [1.] One-line summary of the problem: Mount multiple SMR block devices exceed certain number cause system non-response [2.] Full description of the problem/report: Created some F2FS on SMR devices (mkfs.f2fs -m), then mounted in sequence. Each device is the same Model: HGST HSH721414AL (Size 14TB). Empirically, found that when the amount of SMR device * 1.5Gb > System RAM, the system ran out of memory and hung. No dmesg output. For example, 24 SMR Disk need 24*1.5GB = 36GB. A system with 32G RAM can only mount 21 devices, the 22nd device will be a reproducible cause of system hang. The number of SMR devices with other FS mounted on this system does not interfere with the result above. [3.] Keywords (i.e., modules, networking, kernel): F2FS, SMR, Memory [4.] Kernel information [4.1.] Kernel version (uname -a): Linux 5.13.4-200.fc34.x86_64 #1 SMP Tue Jul 20 20:27:29 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux [4.2.] Kernel .config file: Default Fedora 34 with f2fs-tools-1.14.0-2.fc34.x86_64 [5.] Most recent kernel version which did not have the bug: None [6.] Output of Oops.. 
message (if applicable) with symbolic information resolved (see Documentation/admin-guide/oops-tracing.rst) None [7.] A small shell script or example program which triggers the problem (if possible) mount /dev/sdX /mnt/0X [8.] Memory consumption With 24 * 14T SMR Block device with F2FS free -g total used free shared buff/cache available Mem: 46 36 0 0 10 10 Swap: 0 0 0 With 3 * 14T SMR Block device with F2FS free -g total used free shared buff/cache available Mem: 7 5 0 0 1 1 Swap: 7 0 7 The root cause is, there are three bitmaps: - cur_valid_map - ckpt_valid_map - discard_map and each of them will cost ~500MB memory, {cur, ckpt}_valid_map are necessary, but discard_map is optional, since this bitmap will only be useful in mountpoint that small discard is enabled. For a blkzoned device such as SMR or ZNS devices, f2fs will only issue discard for a section(zone) when all blocks of that section are invalid, so, for such device, we don't need small discard functionality at all. This patch introduces a new mountoption "discard_unit=block|segment| section" to support issuing discard with different basic unit which is aligned to block, segment or section, so that user can specify "discard_unit=segment" or "discard_unit=section" to disable small discard functionality. Note that this mount option can not be changed by remount() due to related metadata need to be initialized during mount(). In order to save memory, let's use "discard_unit=section" for blkzoned device by default. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 8 +++ fs/f2fs/f2fs.h | 16 ++++++ fs/f2fs/segment.c | 82 +++++++++++++++++++----------- fs/f2fs/super.c | 54 ++++++++++++++++++-- fs/f2fs/sysfs.c | 2 + 5 files changed, 130 insertions(+), 32 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index ff9e7cc97c65..8f251a662542 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -312,6 +312,14 @@ inlinecrypt When possible, encrypt/decrypt the contents of encrypted Documentation/block/inline-encryption.rst. atgc Enable age-threshold garbage collection, it provides high effectiveness and efficiency on background GC. +discard_unit=%s Control discard unit, the argument can be "block", "segment" + and "section", issued discard command's offset/size will be + aligned to the unit, by default, "discard_unit=block" is set, + so that small discard functionality is enabled. + For blkzoned device, "discard_unit=section" will be set by + default, it is helpful for large sized SMR or ZNS devices to + reduce memory cost by getting rid of fs metadata supports small + discard. 
======================== ============================================================ Debugfs Entries diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5721e20353e3..b194d58f2cbb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,6 +138,11 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ + int discard_unit; /* + * discard command's offset/size should + * be aligned to this unit: block, + * segment or section + */ struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */ block_t unusable_cap_perc; /* percentage for cap */ block_t unusable_cap; /* Amount of space allowed to be @@ -1298,6 +1303,12 @@ enum { */ }; +enum { + DISCARD_UNIT_BLOCK, /* basic discard unit is block */ + DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */ + DISCARD_UNIT_SECTION, /* basic discard unit is section */ +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -4419,6 +4430,11 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) return false; } +static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e7ca7e96beb1..e6f443e933c7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1892,7 +1892,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } @@ -1917,7 +1918,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; - if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi)) + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) || + !f2fs_block_unit_discard(sbi)) return false; if (!force) { @@ -2002,14 +2004,18 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); - bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); + bool section_alignment = F2FS_OPTION(sbi).discard_unit == + DISCARD_UNIT_SECTION; + + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + section_alignment = true; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; - if (need_align && end != -1) + if (section_alignment && end != -1) end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) @@ -2017,7 +2023,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); - if (need_align) { + if (section_alignment) { start = rounddown(start, sbi->segs_per_sec); end = roundup(end, sbi->segs_per_sec); } @@ -2055,6 +2061,9 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, } mutex_unlock(&dirty_i->seglist_lock); + if (!f2fs_block_unit_discard(sbi)) + goto wakeup; + /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { unsigned int cur_pos = 0, next_pos, len, total_len = 0; @@ 
-2088,6 +2097,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, dcc->nr_discards -= total_len; } +wakeup: wake_up_discard_thread(sbi, false); } @@ -2107,6 +2117,11 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + dcc->discard_granularity = sbi->blocks_per_seg; + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + dcc->discard_granularity = BLKS_PER_SEC(sbi); + INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); @@ -2254,7 +2269,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* @@ -2296,7 +2312,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) } } - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -4281,6 +4298,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) unsigned int sit_segs, start; char *src_bitmap, *bitmap; unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; + unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0; /* allocate memory for SIT information */ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); @@ -4303,9 +4321,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS - bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 4; + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); #else - bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 3; + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); #endif sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!sit_i->bitmap) @@ -4325,8 +4343,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap += SIT_VBLOCK_MAP_SIZE; #endif - sit_i->sentries[start].discard_map = bitmap; - bitmap += SIT_VBLOCK_MAP_SIZE; + if (discard_map) { + sit_i->sentries[start].discard_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + } } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -4488,17 +4508,19 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - /* build discard map only one time */ - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, - se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += - sbi->blocks_per_seg - - se->valid_blocks; + if (f2fs_block_unit_discard(sbi)) { + /* build discard map only one time */ + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (__is_large_section(sbi)) @@ -4534,13 +4556,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, 
se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks; - sbi->discard_blks -= se->valid_blocks; + if (f2fs_block_unit_discard(sbi)) { + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; + } } if (__is_large_section(sbi)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7759f61a628c..1402040c29c7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -153,6 +153,7 @@ enum { Opt_atgc, Opt_gc_merge, Opt_nogc_merge, + Opt_discard_unit, Opt_err, }; @@ -228,6 +229,7 @@ static match_table_t f2fs_tokens = { {Opt_atgc, "atgc"}, {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, + {Opt_discard_unit, "discard_unit=%s"}, {Opt_err, NULL}, }; @@ -1155,6 +1157,25 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nogc_merge: clear_opt(sbi, GC_MERGE); break; + case Opt_discard_unit: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "block")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_BLOCK; + } else if (!strcmp(name, "segment")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SEGMENT; + } else if (!strcmp(name, "section")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1193,6 +1214,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } #endif + if (f2fs_sb_has_blkzoned(sbi)) { + if (F2FS_OPTION(sbi).discard_unit != + DISCARD_UNIT_SECTION) { + f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } + } #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_test_compress_extension(sbi)) { @@ -1914,6 +1943,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, ATGC)) seq_puts(seq, ",atgc"); + + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) + seq_printf(seq, ",discard_unit=%s", "block"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + seq_printf(seq, ",discard_unit=%s", "segment"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + seq_printf(seq, ",discard_unit=%s", "section"); + return 0; } @@ -1948,10 +1985,13 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); set_opt(sbi, DISCARD); - if (f2fs_sb_has_blkzoned(sbi)) + if (f2fs_sb_has_blkzoned(sbi)) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; - else + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; + } else { F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -2053,6 +2093,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); + bool block_unit_discard = f2fs_block_unit_discard(sbi); #ifdef CONFIG_QUOTA int i, j; #endif @@ -2153,6 +2194,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (block_unit_discard != f2fs_block_unit_discard(sbi)) { + 
err = -EINVAL; + f2fs_warn(sbi, "switch discard_unit option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -3738,7 +3785,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; - sm_i->dcc_info->discard_granularity = 1; + if (f2fs_block_unit_discard(sbi)) + sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 61d67bdd35ce..a062d1edfbd1 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -431,6 +431,8 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "discard_granularity")) { if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; if (t == *ui) return count; *ui = t; From da077153b5ad030b5621828586ff7a3387cbc1e2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Aug 2021 08:38:38 +0800 Subject: [PATCH 296/580] f2fs: fix to stop filesystem update once CP failed During f2fs_write_checkpoint(), once we failed in f2fs_flush_nat_entries() or do_checkpoint(), metadata of filesystem such as prefree bitmap, nat/sit version bitmap won't be recovered, it may cause f2fs image to be inconsistent, let's just set CP error flag to avoid further updates until we figure out a scheme to rollback all metadatas in such condition. Reported-by: Yangtao Li Signed-off-by: Yangtao Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 12 +++++++++--- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 15 +++++++++++++-- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 23d36de40317..f6f3685ec7b8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1639,8 +1639,11 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write cached NAT/SIT entries to NAT/SIT area */ err = f2fs_flush_nat_entries(sbi, cpc); - if (err) + if (err) { + f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); goto stop; + } f2fs_flush_sit_entries(sbi, cpc); @@ -1648,10 +1651,13 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_save_inmem_curseg(sbi); err = do_checkpoint(sbi, cpc); - if (err) + if (err) { + f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); f2fs_release_discard_addrs(sbi); - else + } else { f2fs_clear_prefree_segments(sbi, cpc); + } f2fs_restore_inmem_curseg(sbi); stop: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b194d58f2cbb..e15dbe9b06fa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -546,7 +546,7 @@ enum { */ }; -#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ /* congestion wait timeout value, default: 20ms */ #define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e6f443e933c7..5e20e53d8a06 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -776,11 +776,22 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) return 0; for (i = 1; i < sbi->s_ndevs; i++) { + int count = DEFAULT_RETRY_IO_COUNT; + if (!f2fs_test_bit(i, (char 
*)&sbi->dirty_device)) continue; - ret = __submit_flush_wait(sbi, FDEV(i).bdev); - if (ret) + + do { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); + } while (ret && --count); + + if (ret) { + f2fs_stop_checkpoint(sbi, false); break; + } spin_lock(&sbi->dev_lock); f2fs_clear_bit(i, (char *)&sbi->dirty_device); From 9c3b2840e111972a2fd5f44ef27854c6292a3fc2 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Wed, 4 Aug 2021 11:29:46 +0800 Subject: [PATCH 297/580] f2fs: reduce the scope of setting fsck tag when de->name_len is zero I recently found a case where de->name_len is 0 in f2fs_fill_dentries() easily reproduced, and finally set the fsck flag. Thread A Thread B - f2fs_readdir - f2fs_read_inline_dir - ctx->pos = d.max - f2fs_add_dentry - f2fs_add_inline_entry - do_convert_inline_dir - f2fs_add_regular_entry - f2fs_readdir - f2fs_fill_dentries - set_sbi_flag(sbi, SBI_NEED_FSCK) Process A opens the folder, and has been reading without closing it. During this period, Process B created a file under the folder (occupying multiple f2fs_dir_entry, exceeding the d.max of the inline dir). After creation, process A uses the d.max of inline dir to read it again, and it will read that de->name_len is 0. And Chao pointed out that w/o inline conversion, the race condition still can happen as below: dir_entry1: A dir_entry2: B dir_entry3: C free slot: _ ctx->pos: ^ Thread A is traversing directory, ctx-pos moves to below position after readdir() by thread A: AAAABBBB___ ^ Then thread B delete dir_entry2, and create dir_entry3. Thread A calls readdir() to lookup dirents starting from middle of new dirent slots as below: AAAACCCCCC_ ^ In these scenarios, the file system is not damaged, and it's hard to avoid it. But we can bypass tagging FSCK flag if: a) bit_pos (:= ctx->pos % d->max) is non-zero and b) before bit_pos moves to first valid dir_entry. 
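For reference, a minimal user-space sketch of the reproduction scenario described above (illustration only, not part of the fix): one thread keeps an open directory stream and keeps reading from it while another thread grows the directory past the inline-dentry limit. The mount point and file names are placeholders, and actually hitting the window depends on timing and on how libc buffers readdir().

#include <dirent.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static const char *dir_path = "/mnt/f2fs/testdir";	/* placeholder path */

static void *reader(void *arg)
{
	DIR *d = opendir(dir_path);

	/* keep the stream open; later readdir() calls resume from the saved
	 * directory offset, which goes stale once the inline dir converts */
	for (int i = 0; i < 1000 && d; i++) {
		while (readdir(d))
			;
		usleep(1000);
	}
	if (d)
		closedir(d);
	return NULL;
}

static void *writer(void *arg)
{
	char name[128];

	/* create enough long names to force inline-dir conversion */
	for (int i = 0; i < 1000; i++) {
		snprintf(name, sizeof(name),
			 "%s/quite_a_long_file_name_%04d", dir_path, i);
		close(open(name, O_CREAT | O_WRONLY, 0644));
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, reader, NULL);
	pthread_create(&b, NULL, writer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}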
Fixes: ddf06b753a85 ("f2fs: fix to trigger fsck if dirent.name_len is zero") Signed-off-by: Yangtao Li [Chao: clean up description] Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9ceb1b7c39c6..90847b267826 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1002,6 +1002,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; bool readdir_ra = sbi->readdir_ra == 1; + bool found_valid_dirent = false; int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); @@ -1016,13 +1017,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, de = &d->dentry[bit_pos]; if (de->name_len == 0) { + if (found_valid_dirent || !bit_pos) { + printk_ratelimited( + "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, sbi->sb->s_id, + le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } bit_pos++; ctx->pos = start_pos + bit_pos; - printk_ratelimited( - "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, sbi->sb->s_id, - le32_to_cpu(de->ino)); - set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } @@ -1065,6 +1068,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); ctx->pos = start_pos + bit_pos; + found_valid_dirent = true; } out: if (readdir_ra) From d4f8ebd75e112a614b15ceb2dac496edcea1fcad Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 5 Aug 2021 21:30:24 +0800 Subject: [PATCH 298/580] f2fs: Kconfig: clean up config options about compression In fs/f2fs/Kconfig, F2FS_FS_LZ4HC depends on F2FS_FS_LZ4 and F2FS_FS_LZ4 depends on F2FS_FS_COMPRESSION, so no need to make F2FS_FS_LZ4HC depends on F2FS_FS_COMPRESSION explicitly, remove the redudant "depends on", do the similar thing for F2FS_FS_LZORLE. At the same time, it is better to move F2FS_FS_LZORLE next to F2FS_FS_LZO, it looks like a little more clear when make menuconfig, the location of "LZO-RLE compression support" is under "LZO compression support" instead of "F2FS compression feature". Without this patch: F2FS compression feature LZO compression support LZ4 compression support LZ4HC compression support ZSTD compression support LZO-RLE compression support With this patch: F2FS compression feature LZO compression support LZO-RLE compression support LZ4 compression support LZ4HC compression support ZSTD compression support Signed-off-by: Tiezhu Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index db29224df9b3..0a66c0605866 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -113,6 +113,13 @@ config F2FS_FS_LZO help Support LZO compress algorithm, if unsure, say Y. +config F2FS_FS_LZORLE + bool "LZO-RLE compression support" + depends on F2FS_FS_LZO + default y + help + Support LZO-RLE compress algorithm, if unsure, say Y. 
+ config F2FS_FS_LZ4 bool "LZ4 compression support" depends on F2FS_FS_COMPRESSION @@ -122,7 +129,6 @@ config F2FS_FS_LZ4 config F2FS_FS_LZ4HC bool "LZ4HC compression support" - depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZ4 default y help From 0a5d2eb60b1b06b89bead853552210a609696f18 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Aug 2021 10:23:48 +0800 Subject: [PATCH 299/580] f2fs: extent cache: support unaligned extent Compressed inode may suffer read performance issue due to it can not use extent cache, so I propose to add this unaligned extent support to improve it. Currently, it only works in readonly format f2fs image. Unaligned extent: in one compressed cluster, physical block number will be less than logical block number, so we add an extra physical block length in extent info in order to indicate such extent status. The idea is if one whole cluster blocks are contiguous physically, once its mapping info was readed at first time, we will cache an unaligned (or aligned) extent info entry in extent cache, it expects that the mapping info will be hitted when rereading cluster. Merge policy: - Aligned extents can be merged. - Aligned extent and unaligned extent can not be merged. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 24 ++++++++++++++++++++++++ fs/f2fs/data.c | 38 +++++++++++++++++++++++++++----------- fs/f2fs/extent_cache.c | 41 +++++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 20 ++++++++++++++++++++ fs/f2fs/node.c | 20 ++++++++++++++++++++ 5 files changed, 132 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index a9fc6b73cf09..bcec72f7f81b 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1637,6 +1637,30 @@ void f2fs_put_page_dic(struct page *page) f2fs_put_dic(dic); } +/* + * check whether cluster blocks are contiguous, and add extent cache entry + * only if cluster blocks are logically and physically contiguous. + */ +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) +{ + bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR; + int i = compressed ? 1 : 0; + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr) + return 0; + } + + return compressed ? 
i - 1 : i; +} + const struct address_space_operations f2fs_compress_aops = { .releasepage = f2fs_release_page, .invalidatepage = f2fs_invalidate_page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 34ada94f5701..fa0d776b3854 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1097,7 +1097,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -1114,7 +1114,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; int err; page = f2fs_grab_cache_page(mapping, index, for_write); @@ -1412,7 +1412,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; @@ -2087,6 +2087,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; + struct extent_info ei = {0, }; + bool from_dnode = true; int i; int ret = 0; @@ -2117,6 +2119,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; + if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + from_dnode = false; + + if (!from_dnode) + goto skip_reading_dnode; + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) @@ -2124,11 +2132,13 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); +skip_reading_dnode: for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; - blkaddr = data_blkaddr(dn.inode, dn.node_page, - dn.ofs_in_node + i); + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i) : + ei.blk + i - 1; if (!__is_valid_data_blkaddr(blkaddr)) break; @@ -2138,6 +2148,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } cc->nr_cpages++; + + if (!from_dnode && i >= ei.c_len) + break; } /* nothing to decompress */ @@ -2157,8 +2170,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, block_t blkaddr; struct bio_post_read_ctx *ctx; - blkaddr = data_blkaddr(dn.inode, dn.node_page, - dn.ofs_in_node + i + 1); + blkaddr = from_dnode ? 
data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1) : + ei.blk + i; f2fs_wait_on_block_writeback(inode, blkaddr); @@ -2202,13 +2216,15 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, *last_block_in_bio = blkaddr; } - f2fs_put_dnode(&dn); + if (from_dnode) + f2fs_put_dnode(&dn); *bio_ret = bio; return 0; out_put_dnode: - f2fs_put_dnode(&dn); + if (from_dnode) + f2fs_put_dnode(&dn); out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { @@ -2531,7 +2547,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; struct node_info ni; bool ipu_force = false; int err = 0; @@ -3208,7 +3224,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; int err = 0; int flag; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 3ebf976a682d..b120589d8517 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -661,6 +661,47 @@ static void f2fs_update_extent_tree_range(struct inode *inode, f2fs_mark_inode_dirty_sync(inode, true); } +#ifdef CONFIG_F2FS_FS_COMPRESSION +void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + bool leftmost = false; + + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen); + + /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return; + + write_lock(&et->lock); + + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false, + &leftmost); + if (en) + goto unlock_out; + + set_extent_info(&ei, fofs, blkaddr, llen); + ei.c_len = c_len; + + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +unlock_out: + write_unlock(&et->lock); +} +#endif + unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { struct extent_tree *et, *next; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e15dbe9b06fa..400b22d09394 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -579,6 +579,9 @@ struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ u32 blk; /* start block address of the extent */ +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned int c_len; /* physical extent length of compressed blocks */ +#endif }; struct extent_node { @@ -799,6 +802,9 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->fofs = fofs; ei->blk = blk; ei->len = len; +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif } static inline bool __is_discard_mergeable(struct discard_info *back, @@ -823,6 +829,12 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur, static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) 
{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif return (back->fofs + back->len == front->fofs && back->blk + back->len == front->blk); } @@ -4101,12 +4113,16 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); +void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); void f2fs_put_page_dic(struct page *page); +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -4161,6 +4177,7 @@ static inline void f2fs_put_page_dic(struct page *page) { WARN_ON_ONCE(1); } +static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; } static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } @@ -4176,6 +4193,9 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) +static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len) { } #endif static inline void set_compress_context(struct inode *inode) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index fae025d0392d..b01ff7d499a7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -839,6 +839,26 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; dn->data_blkaddr = f2fs_data_blkaddr(dn); + + if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) && + f2fs_sb_has_readonly(sbi)) { + unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn); + block_t blkaddr; + + if (!c_len) + goto out; + + blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == COMPRESS_ADDR) + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + 1); + + f2fs_update_extent_tree_range_compressed(dn->inode, + index, blkaddr, + F2FS_I(dn->inode)->i_cluster_size, + c_len); + } +out: return 0; release_pages: From b4f76651ef782fd726ece1e3b9e45e10c9525399 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Aug 2021 08:05:58 +0800 Subject: [PATCH 300/580] f2fs: avoid unneeded memory allocation in __add_ino_entry() __add_ino_entry() will allocate slab cache even if we have already cached ino entry in radix tree, e.g. for case of multiple devices. Let's check radix tree first under protection of rcu lock to see whether we need to do slab allocation, it will mitigate memory pressure from "f2fs_ino_entry" slab cache. 
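The shape of the change is the classic "look up first, allocate only on a miss, re-check under the lock" pattern. Below is a generic user-space sketch of that pattern, for illustration only: the real code uses RCU for the lock-free lookup and a radix tree rather than the toy table used here, and all names are made up.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_SLOTS 1024

struct entry { unsigned long ino; };

static struct entry *slots[NR_SLOTS];
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static struct entry *lookup(unsigned long ino)
{
	struct entry *e = slots[ino % NR_SLOTS];

	return (e && e->ino == ino) ? e : NULL;
}

static struct entry *add_entry(unsigned long ino)
{
	struct entry *e, *new = NULL;

	/* speculative lookup outside the lock (RCU in the kernel code) */
	e = lookup(ino);
retry:
	if (!e) {
		new = malloc(sizeof(*new));
		if (!new)
			return NULL;
		new->ino = ino;
	}

	pthread_mutex_lock(&lock);
	e = lookup(ino);
	if (!e) {
		if (!new) {
			/* entry vanished after the unlocked lookup:
			 * drop the lock and allocate after all */
			pthread_mutex_unlock(&lock);
			goto retry;
		}
		slots[ino % NR_SLOTS] = new;
		e = new;
		new = NULL;
	}
	pthread_mutex_unlock(&lock);

	free(new);	/* only still set if we lost the insertion race */
	return e;
}

int main(void)
{
	printf("entry for ino 7: %p\n", (void *)add_entry(7));
	return 0;
}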
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f6f3685ec7b8..fba8100b0333 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -465,16 +465,28 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; - struct ino_entry *e, *tmp; + struct ino_entry *e = NULL, *new = NULL; - tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); + if (type == FLUSH_INO) { + rcu_read_lock(); + e = radix_tree_lookup(&im->ino_root, ino); + rcu_read_unlock(); + } + +retry: + if (!e) + new = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { - e = tmp; + if (!new) { + spin_unlock(&im->ino_lock); + goto retry; + } + e = new; if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) f2fs_bug_on(sbi, 1); @@ -492,8 +504,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, spin_unlock(&im->ino_lock); radix_tree_preload_end(); - if (e != tmp) - kmem_cache_free(ino_entry_slab, tmp); + if (new && e != new) + kmem_cache_free(ino_entry_slab, new); } static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) From 8a1d7ef1749761cb29c06d9ac1a84b4e9d9e538c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Aug 2021 08:04:37 +0800 Subject: [PATCH 301/580] f2fs: fix to do sanity check for sb/cp fields correctly This patch fixes below problems of sb/cp sanity check: - in sanity_check_raw_superi(), it missed to consider log header blocks while cp_payload check. - in f2fs_sanity_check_ckpt(), it missed to check nat_bits_blocks. 
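To make the new bound concrete, here is a small stand-alone calculation using example geometry (4 KiB blocks, 512 blocks per segment, 64 NAT segments); the constants mirror the kernel macros, but the numbers themselves, including the assumed NR_CURSEG_PERSIST_TYPE value, are illustrative and not taken from a real image.

#include <stdio.h>

#define BITS_PER_BYTE	8
#define BLKSIZE		4096			/* F2FS block size */

int main(void)
{
	unsigned int log_blocks_per_seg = 9;	/* 512 blocks per segment */
	unsigned int blocks_per_seg = 1u << log_blocks_per_seg;
	unsigned int nat_segs = 64;		/* example geometry */
	unsigned int cp_payload = 2;		/* example value */
	unsigned int cp_packs = 2;		/* F2FS_CP_PACKS */
	unsigned int curseg_persist = 6;	/* assumed NR_CURSEG_PERSIST_TYPE */

	unsigned int nat_blocks = nat_segs << log_blocks_per_seg;
	unsigned int nat_bits_bytes = nat_blocks / BITS_PER_BYTE;
	/* F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8): round bytes up to blocks */
	unsigned int nat_bits_blocks =
		((nat_bits_bytes << 1) + 8 + BLKSIZE - 1) / BLKSIZE;

	unsigned int needed = cp_payload + cp_packs +
			      curseg_persist + nat_bits_blocks;

	printf("nat_bits_blocks = %u\n", nat_bits_blocks);
	printf("cp segment needs %u of %u blocks -> %s\n",
	       needed, blocks_per_seg,
	       needed >= blocks_per_seg ? "reject image" : "ok");
	return 0;
}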
Cc: Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1402040c29c7..8908d8c13690 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3231,11 +3231,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - if (le32_to_cpu(raw_super->cp_payload) > - (blocks_per_seg - F2FS_CP_PACKS)) { - f2fs_info(sbi, "Insane cp_payload (%u > %u)", + if (le32_to_cpu(raw_super->cp_payload) >= + (blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE)) { + f2fs_info(sbi, "Insane cp_payload (%u >= %u)", le32_to_cpu(raw_super->cp_payload), - blocks_per_seg - F2FS_CP_PACKS); + blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE); return -EFSCORRUPTED; } @@ -3271,6 +3273,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count, valid_user_blocks; block_t avail_node_count, valid_node_count; + unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks; int i, j; total = le32_to_cpu(raw_super->segment_count); @@ -3401,6 +3404,17 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + nat_blocks = nat_segs << log_blocks_per_seg; + nat_bits_bytes = nat_blocks / BITS_PER_BYTE; + nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); + if (__is_set_ckpt_flags(ckpt, CP_NAT_BITS_FLAG) && + (cp_payload + F2FS_CP_PACKS + + NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) { + f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)", + cp_payload, nat_bits_blocks); + return -EFSCORRUPTED; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_err(sbi, "A bug case: need to run fsck"); return 1; From 42b3759275c1de418e4b0c11ed90db0efb674e3c Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Thu, 12 Aug 2021 19:36:41 +0800 Subject: [PATCH 302/580] f2fs: compress: avoid duplicate counting of valid blocks when read compressed file Since cluster is basic unit of compression, one cluster is compressed or not, so we can calculate valid blocks only for first page in cluster, the other pages just skip. 
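The optimization amounts to memoizing one expensive per-cluster check across all pages of that cluster. The toy program below is illustration only (all names, including the is_compressed_cluster() stub, are invented); it shows how remembering the last non-compressed cluster index collapses the per-page checks to one per cluster.

#include <stdbool.h>
#include <stdio.h>

#define LOG_CLUSTER_SIZE 2		/* 4 pages per cluster, example */
#define NULL_CLUSTER	 (~0UL)

static int expensive_checks;

/* stand-in for the real per-cluster lookup; pretend nothing here is
 * actually compressed, which is exactly the case the caching targets */
static bool is_compressed_cluster(unsigned long cluster_idx)
{
	(void)cluster_idx;
	expensive_checks++;
	return false;
}

int main(void)
{
	unsigned long nc_cluster_idx = NULL_CLUSTER;

	for (unsigned long index = 0; index < 16; index++) {
		unsigned long cluster_idx = index >> LOG_CLUSTER_SIZE;

		if (cluster_idx == nc_cluster_idx)
			continue;	/* same non-compressed cluster: skip */

		if (!is_compressed_cluster(cluster_idx))
			nc_cluster_idx = cluster_idx;	/* remember the miss */
		else
			nc_cluster_idx = NULL_CLUSTER;
	}
	/* 4 checks for 16 pages instead of 16 */
	printf("expensive checks: %d for 16 pages\n", expensive_checks);
	return 0;
}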
Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fa0d776b3854..19e6fb63a065 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2266,6 +2266,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, .nr_rpages = 0, .nr_cpages = 0, }; + pgoff_t nc_cluster_idx = NULL_CLUSTER; #endif unsigned max_nr_pages = nr_pages; int ret = 0; @@ -2316,12 +2317,23 @@ int f2fs_mpage_readpages(struct address_space *mapping, if (ret) goto set_error_page; } - ret = f2fs_is_compressed_cluster(inode, page->index); - if (ret < 0) - goto set_error_page; - else if (!ret) - goto read_single_page; + if (cc.cluster_idx == NULL_CLUSTER) { + if (nc_cluster_idx == + page->index >> cc.log_cluster_size) { + goto read_single_page; + } + ret = f2fs_is_compressed_cluster(inode, page->index); + if (ret < 0) + goto set_error_page; + else if (!ret) { + nc_cluster_idx = + page->index >> cc.log_cluster_size; + goto read_single_page; + } + + nc_cluster_idx = NULL_CLUSTER; + } ret = f2fs_init_compress_ctx(&cc); if (ret) goto set_error_page; From 0c05d0a193399685c69970e541802d2a1ffd8aa3 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Wed, 11 Aug 2021 12:45:57 +0800 Subject: [PATCH 303/580] f2fs: improve sbi status info in debugfs/f2fs/status Do not use numbers but strings to improve readability when flag is set. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 473ad04d1891..d8c09346545d 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -323,22 +323,43 @@ static void update_mem_info(struct f2fs_sb_info *sbi) #endif } +static char *s_flag[] = { + [SBI_IS_DIRTY] = " fs_dirty", + [SBI_IS_CLOSE] = " closing", + [SBI_NEED_FSCK] = " need_fsck", + [SBI_POR_DOING] = " recovering", + [SBI_NEED_SB_WRITE] = " sb_dirty", + [SBI_NEED_CP] = " need_cp", + [SBI_IS_SHUTDOWN] = " shutdown", + [SBI_IS_RECOVERED] = " recovered", + [SBI_CP_DISABLED] = " cp_disabled", + [SBI_CP_DISABLED_QUICK] = " cp_disabled_quick", + [SBI_QUOTA_NEED_FLUSH] = " quota_need_flush", + [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", + [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", + [SBI_IS_RESIZEFS] = " resizefs", +}; + static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; - int i = 0; - int j; + int i = 0, j = 0; mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s (sbi: 0x%lx)]=====\n", + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, f2fs_readonly(si->sbi->sb) ? "RO": "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? - "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good"), - si->sbi->s_flag); + "Disabled" : (f2fs_cp_error(si->sbi) ? 
"Error" : "Good")); + if (si->sbi->s_flag) { + seq_puts(s, "[SBI:"); + for_each_set_bit(j, &si->sbi->s_flag, 32) + seq_puts(s, s_flag[j]); + seq_puts(s, "]\n"); + } seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", From 85edf8301e8164d903411ede0d757359e11cf428 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 10 Aug 2021 21:27:06 +0800 Subject: [PATCH 304/580] f2fs: correct comment in segment.h s/two/three Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index dab87ecba2b5..c030b4436617 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -142,7 +142,7 @@ enum { }; /* - * In the victim_sel_policy->alloc_mode, there are two block allocation modes. + * In the victim_sel_policy->alloc_mode, there are three block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into @@ -155,7 +155,7 @@ enum { }; /* - * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. + * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. * GC_AT is based on age-threshold algorithm. From 1a1aa8a51cef399460911fa6ec25bc40ad4336e1 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Mon, 9 Aug 2021 10:21:04 +0800 Subject: [PATCH 305/580] f2fs: compress: allow write compress released file after truncate to zero For compressed file, after release compress blocks, don't allow write direct, but we should allow write direct after truncate to zero. Reviewed-by: Chao Yu Signed-off-by: Fengnan Chang Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 7 +++++-- fs/f2fs/file.c | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 8f251a662542..9b0517d90063 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -865,8 +865,11 @@ Compression implementation directly in order to guarantee potential data updates later to the space. Instead, the main goal is to reduce data writes to flash disk as much as possible, resulting in extending disk life time as well as relaxing IO - congestion. Alternatively, we've added ioctl interface to reclaim compressed - space and show it to user after putting the immutable bit. + congestion. Alternatively, we've added ioctl(F2FS_IOC_RELEASE_COMPRESS_BLOCKS) + interface to reclaim compressed space and show it to user after putting the + immutable bit. Immutable bit, after release, it doesn't allow writing/mmaping + on the file, until reserving compressed space via + ioctl(F2FS_IOC_RESERVE_COMPRESS_BLOCKS) or truncating filesize to zero. Compress metadata layout:: diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 91ebf8fc712d..911cbd37140c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -770,6 +770,14 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) return err; #ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * For compressed file, after release compress blocks, don't allow write + * direct, but we should allow write direct after truncate to zero. 
+ */ + if (f2fs_compressed_file(inode) && !free_from + && is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + clear_inode_flag(inode, FI_COMPRESS_RELEASED); + if (from != free_from) { err = f2fs_truncate_partial_cluster(inode, from, lock); if (err) From 8766fdd083e2312c5230c0e0b13c492d200d39d1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 Aug 2021 08:24:48 +0800 Subject: [PATCH 306/580] f2fs: support fault injection for f2fs_kmem_cache_alloc() This patch supports to inject fault into f2fs_kmem_cache_alloc(). Usage: a) echo 32768 > /sys/fs/f2fs//inject_type or b) mount -o fault_type=32768 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/checkpoint.c | 3 ++- fs/f2fs/compress.c | 8 +++++--- fs/f2fs/data.c | 2 +- fs/f2fs/dir.c | 4 ++-- fs/f2fs/extent_cache.c | 5 +++-- fs/f2fs/f2fs.h | 17 ++++++++++++++++- fs/f2fs/gc.c | 6 ++++-- fs/f2fs/node.c | 23 ++++++++++++----------- fs/f2fs/recovery.c | 3 ++- fs/f2fs/segment.c | 10 ++++++---- fs/f2fs/super.c | 4 +++- fs/f2fs/xattr.c | 3 ++- 13 files changed, 59 insertions(+), 30 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 9b0517d90063..21d40e3cfd7a 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -195,6 +195,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_CHECKPOINT 0x000001000 FAULT_DISCARD 0x000002000 FAULT_WRITE_IO 0x000004000 + FAULT_SLAB_ALLOC 0x000008000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fba8100b0333..7f9f855d94af 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -475,7 +475,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, retry: if (!e) - new = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); + new = f2fs_kmem_cache_alloc(ino_entry_slab, + GFP_NOFS, true, NULL); radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index bcec72f7f81b..782333a52010 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -29,7 +29,8 @@ static void *page_array_alloc(struct inode *inode, int nr) unsigned int size = sizeof(struct page *) * nr; if (likely(size <= sbi->page_array_slab_size)) - return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS); + return f2fs_kmem_cache_alloc(sbi->page_array_slab, + GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); return f2fs_kzalloc(sbi, size, GFP_NOFS); } @@ -1199,7 +1200,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, fio.version = ni.version; - cic = kmem_cache_zalloc(cic_entry_slab, GFP_NOFS); + cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!cic) goto out_put_dnode; @@ -1477,7 +1478,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) pgoff_t start_idx = start_idx_of_cluster(cc); int i; - dic = kmem_cache_zalloc(dic_entry_slab, GFP_NOFS); + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, + false, F2FS_I_SB(cc->inode)); if (!dic) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 19e6fb63a065..ce07fc9b58a3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -702,7 +702,7 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct bio_entry *be; - be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + be = 
f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL); be->bio = bio; bio_get(bio); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 90847b267826..3ea73c44906d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -83,8 +83,8 @@ int f2fs_init_casefolded_name(const struct inode *dir, struct super_block *sb = dir->i_sb; if (IS_CASEFOLDED(dir)) { - fname->cf_name.name = kmem_cache_alloc(f2fs_cf_name_slab, - GFP_NOFS); + fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS, false, F2FS_SB(sb)); if (!fname->cf_name.name) return -ENOMEM; fname->cf_name.len = utf8_casefold(sb->s_encoding, diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index b120589d8517..866e72b29bd5 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -239,7 +239,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, { struct extent_node *en; - en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); if (!en) return NULL; @@ -292,7 +292,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { - et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + et = f2fs_kmem_cache_alloc(extent_tree_slab, + GFP_NOFS, true, NULL); f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 400b22d09394..a1b9ec19baf1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -52,6 +52,7 @@ enum { FAULT_CHECKPOINT, FAULT_DISCARD, FAULT_WRITE_IO, + FAULT_SLAB_ALLOC, FAULT_MAX, }; @@ -2622,7 +2623,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } -static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, +static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep, gfp_t flags) { void *entry; @@ -2633,6 +2634,20 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags, bool nofail, struct f2fs_sb_info *sbi) +{ + if (nofail) + return f2fs_kmem_cache_alloc_nofail(cachep, flags); + + if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) { + f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC); + return NULL; + } + + return kmem_cache_alloc(cachep, flags); +} + static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) { if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9dce44619069..3bc0f0162e31 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -371,7 +371,8 @@ static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, struct atgc_management *am = &sbi->am; struct victim_entry *ve; - ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS); + ve = f2fs_kmem_cache_alloc(victim_entry_slab, + GFP_NOFS, true, NULL); ve->mtime = mtime; ve->segno = segno; @@ -849,7 +850,8 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) iput(inode); return; } - new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS); + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, + GFP_NOFS, true, NULL); new_ie->inode = inode; f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b01ff7d499a7..3f560f3579d3 100644 --- 
a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -162,14 +162,13 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, + nid_t nid, bool no_fail) { struct nat_entry *new; - if (no_fail) - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); - else - new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_F2FS_ZERO, no_fail, sbi); if (new) { nat_set_nid(new, nid); nat_reset_flag(new); @@ -242,7 +241,8 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { - head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, + GFP_NOFS, true, NULL); INIT_LIST_HEAD(&head->entry_list); INIT_LIST_HEAD(&head->set_list); @@ -329,7 +329,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, unsigned long flags; unsigned int seq_id; - fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS); + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, + GFP_NOFS, true, NULL); get_page(page); fn->page = page; @@ -428,7 +429,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; - new = __alloc_nat_entry(nid, false); + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; @@ -451,7 +452,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - struct nat_entry *new = __alloc_nat_entry(ni->nid, true); + struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); @@ -2252,7 +2253,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, if (unlikely(f2fs_check_nid_range(sbi, nid))) return false; - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL); i->nid = nid; i->state = FREE_NID; @@ -2842,7 +2843,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = __alloc_nat_entry(nid, true); + ne = __alloc_nat_entry(sbi, nid, true); __init_nat_entry(nm_i, ne, &raw_ne, true); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e39371760be8..1528a43123f7 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -91,7 +91,8 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, goto err_out; } - entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, + GFP_F2FS_ZERO, true, NULL); entry->inode = inode; list_add_tail(&entry->list, head); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5e20e53d8a06..3c297b1ffaf0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -188,7 +188,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) set_page_private_atomic(page); - new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); + new = f2fs_kmem_cache_alloc(inmem_entry_slab, + GFP_NOFS, true, NULL); /* add atomic page indices to the list */ new->page = page; @@ -1001,7 +1002,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, pend_list = &dcc->pend_list[plist_idx(len)]; - dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + dc = 
f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; dc->lstart = lstart; @@ -1961,7 +1962,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!de) { de = f2fs_kmem_cache_alloc(discard_entry_slab, - GFP_F2FS_ZERO); + GFP_F2FS_ZERO, true, NULL); de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); list_add_tail(&de->list, head); } @@ -4098,7 +4099,8 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = - f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS); + f2fs_kmem_cache_alloc(sit_entry_set_slab, + GFP_NOFS, true, NULL); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8908d8c13690..d883a52d0ac5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -55,6 +55,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_CHECKPOINT] = "checkpoint error", [FAULT_DISCARD] = "discard error", [FAULT_WRITE_IO] = "write IO error", + [FAULT_SLAB_ALLOC] = "slab alloc", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -1282,7 +1283,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); + fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep, + GFP_F2FS_ZERO, false, F2FS_SB(sb)); if (!fi) return NULL; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3d43b27bad95..7f59039aef85 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -27,7 +27,8 @@ static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline) { if (likely(size == sbi->inline_xattr_slab_size)) { *is_inline = true; - return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS); + return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab, + GFP_F2FS_ZERO, false, sbi); } *is_inline = false; return f2fs_kzalloc(sbi, size, GFP_NOFS); From 60eff273d00c0c6b70ecb4ee193fa86335d1c9ce Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Aug 2021 10:45:20 +0800 Subject: [PATCH 307/580] f2fs: fix to keep compatibility of fault injection interface The value of FAULT_* macros and its description in f2fs.rst became inconsistent, fix this to keep compatibility of fault injection interface. 
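The compatibility point is that fault_type is a bitmask indexed by the FAULT_* enum, so dropping an enumerator silently renumbers every later bit. A stand-alone sketch (the enum below is a trimmed copy kept only for illustration) that reproduces the documented 0x000008000 / "echo 32768" value for FAULT_SLAB_ALLOC:

#include <stdio.h>

enum {
	FAULT_KMALLOC,
	FAULT_KVMALLOC,
	FAULT_PAGE_ALLOC,
	FAULT_PAGE_GET,
	FAULT_ALLOC_BIO,	/* obsolete, kept so later bits stay put */
	FAULT_ALLOC_NID,
	FAULT_ORPHAN,
	FAULT_BLOCK,
	FAULT_DIR_DEPTH,
	FAULT_EVICT_INODE,
	FAULT_TRUNCATE,
	FAULT_READ_IO,
	FAULT_CHECKPOINT,
	FAULT_DISCARD,
	FAULT_WRITE_IO,
	FAULT_SLAB_ALLOC,
	FAULT_MAX,
};

int main(void)
{
	/* matches FAULT_SLAB_ALLOC 0x000008000 in f2fs.rst */
	printf("FAULT_SLAB_ALLOC mask = 0x%x (%u)\n",
	       1u << FAULT_SLAB_ALLOC, 1u << FAULT_SLAB_ALLOC);
	return 0;
}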
Fixes: 67883ade7a98 ("f2fs: remove FAULT_ALLOC_BIO") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/f2fs.h | 1 + 2 files changed, 2 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 21d40e3cfd7a..09de6ebbbdfa 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -185,6 +185,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_KVMALLOC 0x000000002 FAULT_PAGE_ALLOC 0x000000004 FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 (obsolete) FAULT_ALLOC_NID 0x000000020 FAULT_ORPHAN 0x000000040 FAULT_BLOCK 0x000000080 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a1b9ec19baf1..9dc90cbe6789 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -42,6 +42,7 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, + FAULT_ALLOC_BIO, /* it's obsolete due to bio_alloc() will never fail */ FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, From ceee8bbb4d6e001759c352d356e4a6c239bce838 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 14 Aug 2021 18:37:01 +0800 Subject: [PATCH 308/580] f2fs: convert S_IRUGO to 0444 To fix: WARNING: Symbolic permissions 'S_IRUGO' are not preferred. Consider using octal permissions '0444'. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- fs/f2fs/sysfs.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d8c09346545d..8c50518475a9 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -642,7 +642,7 @@ void __init f2fs_create_root_stats(void) #ifdef CONFIG_DEBUG_FS f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); - debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, + debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL, &stat_fops); #endif } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a062d1edfbd1..21cbc2d78e4e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1242,13 +1242,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); if (sbi->s_proc) { - proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc, + proc_create_single_data("segment_info", 0444, sbi->s_proc, segment_info_seq_show, sb); - proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc, + proc_create_single_data("segment_bits", 0444, sbi->s_proc, segment_bits_seq_show, sb); - proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, + proc_create_single_data("iostat_info", 0444, sbi->s_proc, iostat_info_seq_show, sb); - proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc, + proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); } return 0; From fa2398f0a9c7e6ac78b16f4a4c6a069f41769b12 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sun, 15 Aug 2021 01:58:40 +0800 Subject: [PATCH 309/580] f2fs: fix description about main_blkaddr node Don't leave a blank line, to keep the style consistent with other node descriptions. 
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index d3e441415ed0..9c89ad131304 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -38,8 +38,7 @@ Description: This parameter controls the number of prefree segments to be What: /sys/fs/f2fs//main_blkaddr Date: November 2019 Contact: "Ramon Pantin" -Description: - Shows first block address of MAIN area. +Description: Shows first block address of MAIN area. What: /sys/fs/f2fs//ipu_policy Date: November 2013 From d8e5d1aca64eb352f1eb9fa64354a8e7d12e8194 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Aug 2021 08:02:50 +0800 Subject: [PATCH 310/580] f2fs: compress: do sanity check on cluster This patch adds f2fs_sanity_check_cluster() to support doing sanity check on cluster of compressed file, it will be triggered from below two paths: - __f2fs_cluster_blocks() - f2fs_map_blocks(F2FS_GET_BLOCK_FIEMAP) And it can detect below three kind of cluster insanity status. C: COMPRESS_ADDR N: NULL_ADDR or NEW_ADDR V: valid blkaddr *: any value 1. [*|C|*|*] 2. [C|*|C|*] 3. [C|N|N|V] Signed-off-by: Chao Yu [Nathan Chancellor: fix missing inline warning] Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/data.c | 7 ++++++ fs/f2fs/f2fs.h | 2 ++ 3 files changed, 62 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 782333a52010..df6f08c2613c 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -870,6 +870,54 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc) return false; } +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + bool compressed = dn->data_blkaddr == COMPRESS_ADDR; + int cluster_end = 0; + int i; + char *reason = ""; + + if (!compressed) + return false; + + /* [..., COMPR_ADDR, ...] 
*/ + if (dn->ofs_in_node % cluster_size) { + reason = "[*|C|*|*]"; + goto out; + } + + for (i = 1; i < cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + /* [COMPR_ADDR, ..., COMPR_ADDR] */ + if (blkaddr == COMPRESS_ADDR) { + reason = "[C|*|C|*]"; + goto out; + } + if (compressed) { + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; + } + } + } + return false; +out: + f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", + dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return true; +} + static int __f2fs_cluster_blocks(struct inode *inode, unsigned int cluster_idx, bool compr) { @@ -887,6 +935,11 @@ static int __f2fs_cluster_blocks(struct inode *inode, goto fail; } + if (f2fs_sanity_check_cluster(&dn)) { + ret = -EFSCORRUPTED; + goto fail; + } + if (dn.data_blkaddr == COMPRESS_ADDR) { int i; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ce07fc9b58a3..46f0abc2a0e2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1514,6 +1514,13 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { + if (f2fs_compressed_file(inode) && + f2fs_sanity_check_cluster(&dn) && + (flag != F2FS_GET_BLOCK_FIEMAP || + IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { + err = -EFSCORRUPTED; + goto sync_out; + } if (flag == F2FS_GET_BLOCK_BMAP) { map->m_pblk = 0; goto sync_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9dc90cbe6789..bc8c76992963 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4123,6 +4123,7 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); int f2fs_write_multi_pages(struct compress_ctx *cc, int *submitted, @@ -4194,6 +4195,7 @@ static inline void f2fs_put_page_dic(struct page *page) WARN_ON_ONCE(1); } static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; } +static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; } static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } From 132506ae8348727b27f5e8691f49ccd2036663a8 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 19 Aug 2021 20:52:28 -0700 Subject: [PATCH 311/580] f2fs: separate out iostat feature Added F2FS_IOSTAT config option to support getting IO statistics through sysfs and printing out periodic IO statistics tracepoint events and moved I/O statistics related codes into separate files for better maintenance. 
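For completeness, a small user-space sketch of how the feature is consumed once CONFIG_F2FS_IOSTAT is enabled; "sdb1" is a placeholder device name, iostat_enable and iostat_info follow the nodes shown in this series, and iostat_period_ms is assumed to be exposed as a sibling sysfs node.

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char line[256];
	FILE *f;

	/* enable accounting and shorten the tracepoint period to 1s */
	write_str("/sys/fs/f2fs/sdb1/iostat_enable", "1");
	write_str("/sys/fs/f2fs/sdb1/iostat_period_ms", "1000");

	/* dump the counters printed by iostat_info_seq_show() */
	f = fopen("/proc/fs/f2fs/sdb1/iostat_info", "r");
	if (!f) {
		perror("iostat_info");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}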
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu [Jaegeuk Kim: set default=y] Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 9 +++ fs/f2fs/Makefile | 1 + fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 1 + fs/f2fs/f2fs.h | 59 +++----------- fs/f2fs/file.c | 1 + fs/f2fs/gc.c | 1 + fs/f2fs/iostat.c | 154 ++++++++++++++++++++++++++++++++++++ fs/f2fs/iostat.h | 27 +++++++ fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 1 + fs/f2fs/super.c | 10 +-- fs/f2fs/sysfs.c | 106 +++---------------------- include/trace/events/f2fs.h | 2 + 14 files changed, 225 insertions(+), 149 deletions(-) create mode 100644 fs/f2fs/iostat.c create mode 100644 fs/f2fs/iostat.h diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 0a66c0605866..0da1790e036a 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -141,3 +141,12 @@ config F2FS_FS_ZSTD default y help Support ZSTD compress algorithm, if unsure, say Y. + +config F2FS_IOSTAT + bool "F2FS IO statistics information" + depends on F2FS_FS + default y + help + Support getting IO statistics through sysfs and printing out periodic + IO statistics tracepoint events. You have to turn on "iostat_enable" + sysfs node to enable this feature. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index e5295746208b..8a7322d229e4 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -9,3 +9,4 @@ f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o +f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7f9f855d94af..e3b828cd660d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -18,6 +18,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "iostat.h" #include #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 46f0abc2a0e2..26e4dffb9888 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -23,6 +23,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "iostat.h" #include #define NUM_PREALLOC_POST_READ_CTXS 128 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bc8c76992963..10297cea4075 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1718,14 +1718,6 @@ struct f2fs_sb_info { #endif spinlock_t stat_lock; /* lock for stat operations */ - /* For app/fs IO statistics */ - spinlock_t iostat_lock; - unsigned long long rw_iostat[NR_IO_TYPE]; - unsigned long long prev_rw_iostat[NR_IO_TYPE]; - bool iostat_enable; - unsigned long iostat_next_period; - unsigned int iostat_period_ms; - /* to attach REQ_META|REQ_FUA flags */ unsigned int data_io_flag; unsigned int node_io_flag; @@ -1783,6 +1775,16 @@ struct f2fs_sb_info { unsigned int compress_watermark; /* cache page watermark */ atomic_t compress_page_hit; /* cache hit count */ #endif + +#ifdef CONFIG_F2FS_IOSTAT + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long rw_iostat[NR_IO_TYPE]; + unsigned long long prev_rw_iostat[NR_IO_TYPE]; + bool iostat_enable; + unsigned long iostat_next_period; + unsigned int iostat_period_ms; +#endif }; struct f2fs_private_dio { @@ -3261,47 +3263,6 @@ static inline int get_inline_xattr_addrs(struct inode *inode) sizeof((f2fs_inode)->field)) \ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ -#define DEFAULT_IOSTAT_PERIOD_MS 3000 -#define MIN_IOSTAT_PERIOD_MS 100 -/* maximum period of iostat tracing is 1 day */ -#define MAX_IOSTAT_PERIOD_MS 8640000 - -static inline void f2fs_reset_iostat(struct f2fs_sb_info 
*sbi) -{ - int i; - - spin_lock(&sbi->iostat_lock); - for (i = 0; i < NR_IO_TYPE; i++) { - sbi->rw_iostat[i] = 0; - sbi->prev_rw_iostat[i] = 0; - } - spin_unlock(&sbi->iostat_lock); -} - -extern void f2fs_record_iostat(struct f2fs_sb_info *sbi); - -static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, - enum iostat_type type, unsigned long long io_bytes) -{ - if (!sbi->iostat_enable) - return; - spin_lock(&sbi->iostat_lock); - sbi->rw_iostat[type] += io_bytes; - - if (type == APP_WRITE_IO || type == APP_DIRECT_IO) - sbi->rw_iostat[APP_BUFFERED_IO] = - sbi->rw_iostat[APP_WRITE_IO] - - sbi->rw_iostat[APP_DIRECT_IO]; - - if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) - sbi->rw_iostat[APP_BUFFERED_READ_IO] = - sbi->rw_iostat[APP_READ_IO] - - sbi->rw_iostat[APP_DIRECT_READ_IO]; - spin_unlock(&sbi->iostat_lock); - - f2fs_record_iostat(sbi); -} - #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 911cbd37140c..fe609f24688f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -29,6 +29,7 @@ #include "xattr.h" #include "acl.h" #include "gc.h" +#include "iostat.h" #include #include diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3bc0f0162e31..2c18443972b6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -19,6 +19,7 @@ #include "node.h" #include "segment.h" #include "gc.h" +#include "iostat.h" #include static struct kmem_cache *victim_entry_slab; diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c new file mode 100644 index 000000000000..21c29e121a86 --- /dev/null +++ b/fs/f2fs/iostat.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs iostat support + * + * Copyright 2021 Google LLC + * Author: Daeho Jeong + */ + +#include +#include +#include + +#include "f2fs.h" +#include "iostat.h" +#include + +int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app write IOs */ + seq_puts(seq, "[WRITE]\n"); + seq_printf(seq, "app buffered: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_IO]); + + /* print fs write IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->rw_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->rw_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->rw_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->rw_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->rw_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->rw_iostat[FS_CP_META_IO]); + + /* print app read IOs */ + seq_puts(seq, "[READ]\n"); + seq_printf(seq, "app buffered: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_READ_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_READ_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_READ_IO]); + + /* print fs read IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->rw_iostat[FS_DATA_READ_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + 
sbi->rw_iostat[FS_GDATA_READ_IO]); + seq_printf(seq, "fs compr_data: %-16llu\n", + sbi->rw_iostat[FS_CDATA_READ_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_READ_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_READ_IO]); + + /* print other IOs */ + seq_puts(seq, "[OTHER]\n"); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->rw_iostat[FS_DISCARD]); + + return 0; +} + +static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) +{ + unsigned long long iostat_diff[NR_IO_TYPE]; + int i; + + if (time_is_after_jiffies(sbi->iostat_next_period)) + return; + + /* Need double check under the lock */ + spin_lock(&sbi->iostat_lock); + if (time_is_after_jiffies(sbi->iostat_next_period)) { + spin_unlock(&sbi->iostat_lock); + return; + } + sbi->iostat_next_period = jiffies + + msecs_to_jiffies(sbi->iostat_period_ms); + + for (i = 0; i < NR_IO_TYPE; i++) { + iostat_diff[i] = sbi->rw_iostat[i] - + sbi->prev_rw_iostat[i]; + sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; + } + spin_unlock(&sbi->iostat_lock); + + trace_f2fs_iostat(sbi, iostat_diff); +} + +void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) { + sbi->rw_iostat[i] = 0; + sbi->prev_rw_iostat[i] = 0; + } + spin_unlock(&sbi->iostat_lock); +} + +void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + + spin_lock(&sbi->iostat_lock); + sbi->rw_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->rw_iostat[APP_BUFFERED_IO] = + sbi->rw_iostat[APP_WRITE_IO] - + sbi->rw_iostat[APP_DIRECT_IO]; + + if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) + sbi->rw_iostat[APP_BUFFERED_READ_IO] = + sbi->rw_iostat[APP_READ_IO] - + sbi->rw_iostat[APP_DIRECT_READ_IO]; + spin_unlock(&sbi->iostat_lock); + + f2fs_record_iostat(sbi); +} + +int f2fs_init_iostat(struct f2fs_sb_info *sbi) +{ + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; + + return 0; +} diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h new file mode 100644 index 000000000000..46e4a36fc8e9 --- /dev/null +++ b/fs/f2fs/iostat.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + * Author: Daeho Jeong + */ +#ifndef __F2FS_IOSTAT_H__ +#define __F2FS_IOSTAT_H__ + +#ifdef CONFIG_F2FS_IOSTAT + +#define DEFAULT_IOSTAT_PERIOD_MS 3000 +#define MIN_IOSTAT_PERIOD_MS 100 +/* maximum period of iostat tracing is 1 day */ +#define MAX_IOSTAT_PERIOD_MS 8640000 + +extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset); +extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes); +extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); +#else +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) {} +static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; } +#endif +#endif /* __F2FS_IOSTAT_H__ */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3f560f3579d3..2356a047e9fe 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,6 +17,7 @@ #include "node.h" #include "segment.h" #include "xattr.h" +#include "iostat.h" #include #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) diff --git 
a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3c297b1ffaf0..418c3ef00d9c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,6 +20,7 @@ #include "segment.h" #include "node.h" #include "gc.h" +#include "iostat.h" #include #define __reverse_ffz(x) __reverse_ffs(~(x)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d883a52d0ac5..88bf6b586ecf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -32,6 +32,7 @@ #include "segment.h" #include "xattr.h" #include "gc.h" +#include "iostat.h" #define CREATE_TRACE_POINTS #include @@ -3923,11 +3924,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - /* init iostat info */ - spin_lock_init(&sbi->iostat_lock); - sbi->iostat_enable = false; - sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; - for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1 : NR_TEMP_TYPE; int j; @@ -3958,6 +3954,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = f2fs_init_iostat(sbi); + if (err) + goto free_bio_info; + err = init_percpu_info(sbi); if (err) goto free_bio_info; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 21cbc2d78e4e..40a6abdb7a27 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -17,6 +17,7 @@ #include "f2fs.h" #include "segment.h" #include "gc.h" +#include "iostat.h" #include static struct proc_dir_entry *f2fs_proc_root; @@ -480,6 +481,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } +#ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; if (!sbi->iostat_enable) @@ -495,6 +497,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, spin_unlock(&sbi->iostat_lock); return count; } +#endif #ifdef CONFIG_F2FS_FS_COMPRESSION if (!strcmp(a->attr.name, "compr_written_block") || @@ -695,8 +698,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); +#ifdef CONFIG_F2FS_IOSTAT F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms); +#endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -801,8 +806,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), ATTR_LIST(umount_discard_timeout), +#ifdef CONFIG_F2FS_IOSTAT ATTR_LIST(iostat_enable), ATTR_LIST(iostat_period_ms), +#endif ATTR_LIST(readdir_ra), ATTR_LIST(max_io_bytes), ATTR_LIST(gc_pin_file_thresh), @@ -1065,101 +1072,6 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, return 0; } -void f2fs_record_iostat(struct f2fs_sb_info *sbi) -{ - unsigned long long iostat_diff[NR_IO_TYPE]; - int i; - - if (time_is_after_jiffies(sbi->iostat_next_period)) - return; - - /* Need double check under the lock */ - spin_lock(&sbi->iostat_lock); - if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock(&sbi->iostat_lock); - return; - } - sbi->iostat_next_period = jiffies + - msecs_to_jiffies(sbi->iostat_period_ms); - - for (i = 0; i < NR_IO_TYPE; i++) { - iostat_diff[i] = sbi->rw_iostat[i] - - sbi->prev_rw_iostat[i]; - 
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; - } - spin_unlock(&sbi->iostat_lock); - - trace_f2fs_iostat(sbi, iostat_diff); -} - -static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, - void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - time64_t now = ktime_get_real_seconds(); - - if (!sbi->iostat_enable) - return 0; - - seq_printf(seq, "time: %-16llu\n", now); - - /* print app write IOs */ - seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_IO]); - - /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", - sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", - sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", - sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", - sbi->rw_iostat[FS_CP_META_IO]); - - /* print app read IOs */ - seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_READ_IO]); - - /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->rw_iostat[FS_GDATA_READ_IO]); - seq_printf(seq, "fs compr_data: %-16llu\n", - sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_READ_IO]); - - /* print other IOs */ - seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", - sbi->rw_iostat[FS_DISCARD]); - - return 0; -} - static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, void *offset) { @@ -1246,8 +1158,10 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) segment_info_seq_show, sb); proc_create_single_data("segment_bits", 0444, sbi->s_proc, segment_bits_seq_show, sb); +#ifdef CONFIG_F2FS_IOSTAT proc_create_single_data("iostat_info", 0444, sbi->s_proc, iostat_info_seq_show, sb); +#endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); } @@ -1267,7 +1181,9 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { if (sbi->s_proc) { +#ifdef CONFIG_F2FS_IOSTAT remove_proc_entry("iostat_info", sbi->s_proc); +#endif remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 4d72bf206fd4..1a856dce344b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1827,6 +1827,7 @@ DEFINE_EVENT(f2fs_zip_end, f2fs_decompress_pages_end, TP_ARGS(inode, cluster_idx, compressed_size, ret) ); +#ifdef CONFIG_F2FS_IOSTAT TRACE_EVENT(f2fs_iostat, TP_PROTO(struct f2fs_sb_info *sbi, unsigned long long *iostat), @@ -1902,6 +1903,7 @@ TRACE_EVENT(f2fs_iostat, 
__entry->app_mrio, __entry->fs_drio, __entry->fs_gdrio, __entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio) ); +#endif TRACE_EVENT(f2fs_bmap, From 76e3c701971fca1015a840013cdaa7aa92e994f5 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 20 Aug 2021 15:29:09 -0700 Subject: [PATCH 312/580] f2fs: introduce periodic iostat io latency traces Whenever we notice some sluggish issues on our machines, we are always curious about how well all types of I/O in the f2fs filesystem are handled. But, it's hard to get this kind of real data. First of all, we need to reproduce the issue while turning on the profiling tool like blktrace, but the issue doesn't happen again easily. Second, with the intervention of any tools, the overall timing of the issue will be slightly changed and it sometimes makes us hard to figure it out. So, I added the feature printing out IO latency statistics tracepoint events, which are minimal things to understand filesystem's I/O related behaviors, into F2FS_IOSTAT kernel config. With "iostat_enable" sysfs node on, we can get this statistics info in a periodic way and it would cause the least overhead. [samples] f2fs_ckpt-254:1-507 [003] .... 2842.439683: f2fs_iostat_latency: dev = (254,11), iotype [peak lat.(ms)/avg lat.(ms)/count], rd_data [136/1/801], rd_node [136/1/1704], rd_meta [4/2/4], wr_sync_data [164/16/3331], wr_sync_node [152/3/648], wr_sync_meta [160/2/4243], wr_async_data [24/13/15], wr_async_node [0/0/0], wr_async_meta [0/0/0] f2fs_ckpt-254:1-507 [002] .... 2845.450514: f2fs_iostat_latency: dev = (254,11), iotype [peak lat.(ms)/avg lat.(ms)/count], rd_data [60/3/456], rd_node [60/3/1258], rd_meta [0/0/1], wr_sync_data [120/12/2285], wr_sync_node [88/5/428], wr_sync_meta [52/6/2990], wr_async_data [4/1/3], wr_async_node [0/0/0], wr_async_meta [0/0/0] Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 19 ++++-- fs/f2fs/f2fs.h | 4 ++ fs/f2fs/iostat.c | 133 ++++++++++++++++++++++++++++++++++++ fs/f2fs/iostat.h | 57 ++++++++++++++++ fs/f2fs/super.c | 13 +++- include/trace/events/f2fs.h | 95 ++++++++++++++++++++++++++ 6 files changed, 315 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 26e4dffb9888..84a793dff4d2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -269,7 +269,10 @@ static void f2fs_post_read_work(struct work_struct *work) static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); - struct bio_post_read_ctx *ctx = bio->bi_private; + struct bio_post_read_ctx *ctx; + + iostat_update_and_unbind_ctx(bio, 0); + ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); @@ -291,10 +294,13 @@ static void f2fs_read_end_io(struct bio *bio) static void f2fs_write_end_io(struct bio *bio) { - struct f2fs_sb_info *sbi = bio->bi_private; + struct f2fs_sb_info *sbi; struct bio_vec *bvec; int iter_all; + iostat_update_and_unbind_ctx(bio, 1); + sbi = bio->bi_private; + if (time_to_inject(sbi, FAULT_WRITE_IO)) { f2fs_show_injection_info(sbi, FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; @@ -408,6 +414,8 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, fio->type, fio->temp); } + iostat_alloc_and_bind_ctx(sbi, bio, NULL); + if (fio->io_wbc) wbc_init_bio(fio->io_wbc, bio); @@ -462,6 +470,8 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, trace_f2fs_submit_read_bio(sbi->sb, type, bio); else 
trace_f2fs_submit_write_bio(sbi->sb, type, bio); + + iostat_update_submit_ctx(bio, type); submit_bio(bio); } @@ -936,7 +946,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - struct bio_post_read_ctx *ctx; + struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, @@ -970,6 +980,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->fs_blkaddr = blkaddr; bio->bi_private = ctx; } + iostat_alloc_and_bind_ctx(sbi, bio, ctx); return bio; } @@ -2213,7 +2224,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; - ctx = bio->bi_private; + ctx = get_post_read_ctx(bio); ctx->enabled_steps |= STEP_DECOMPRESS; refcount_inc(&dic->refcnt); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 10297cea4075..c6df19b0937f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1784,6 +1784,10 @@ struct f2fs_sb_info { bool iostat_enable; unsigned long iostat_next_period; unsigned int iostat_period_ms; + + /* For io latency related statistics info in one iostat period */ + spinlock_t iostat_lat_lock; + struct iostat_lat_info *iostat_io_lat; #endif }; diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index 21c29e121a86..cdcf54ae0db8 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -14,6 +14,10 @@ #include "iostat.h" #include +#define NUM_PREALLOC_IOSTAT_CTXS 128 +static struct kmem_cache *bio_iostat_ctx_cache; +static mempool_t *bio_iostat_ctx_pool; + int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; @@ -81,6 +85,32 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) return 0; } +static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) +{ + int io, idx = 0; + unsigned int cnt; + struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + + spin_lock_irq(&sbi->iostat_lat_lock); + for (idx = 0; idx < MAX_IO_TYPE; idx++) { + for (io = 0; io < NR_PAGE_TYPE; io++) { + cnt = io_lat->bio_cnt[idx][io]; + iostat_lat[idx][io].peak_lat = + jiffies_to_msecs(io_lat->peak_lat[idx][io]); + iostat_lat[idx][io].cnt = cnt; + iostat_lat[idx][io].avg_lat = cnt ? 
+ jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0; + io_lat->sum_lat[idx][io] = 0; + io_lat->peak_lat[idx][io] = 0; + io_lat->bio_cnt[idx][io] = 0; + } + } + spin_unlock_irq(&sbi->iostat_lat_lock); + + trace_f2fs_iostat_latency(sbi, iostat_lat); +} + static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) { unsigned long long iostat_diff[NR_IO_TYPE]; @@ -106,10 +136,13 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) spin_unlock(&sbi->iostat_lock); trace_f2fs_iostat(sbi, iostat_diff); + + __record_iostat_latency(sbi); } void f2fs_reset_iostat(struct f2fs_sb_info *sbi) { + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int i; spin_lock(&sbi->iostat_lock); @@ -118,6 +151,10 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) sbi->prev_rw_iostat[i] = 0; } spin_unlock(&sbi->iostat_lock); + + spin_lock_irq(&sbi->iostat_lat_lock); + memset(io_lat, 0, sizeof(struct iostat_lat_info)); + spin_unlock_irq(&sbi->iostat_lat_lock); } void f2fs_update_iostat(struct f2fs_sb_info *sbi, @@ -143,12 +180,108 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, f2fs_record_iostat(sbi); } +static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, + int rw, bool is_sync) +{ + unsigned long ts_diff; + unsigned int iotype = iostat_ctx->type; + unsigned long flags; + struct f2fs_sb_info *sbi = iostat_ctx->sbi; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + int idx; + + if (!sbi->iostat_enable) + return; + + ts_diff = jiffies - iostat_ctx->submit_ts; + if (iotype >= META_FLUSH) + iotype = META; + + if (rw == 0) { + idx = READ_IO; + } else { + if (is_sync) + idx = WRITE_SYNC_IO; + else + idx = WRITE_ASYNC_IO; + } + + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + io_lat->sum_lat[idx][iotype] += ts_diff; + io_lat->bio_cnt[idx][iotype]++; + if (ts_diff > io_lat->peak_lat[idx][iotype]) + io_lat->peak_lat[idx][iotype] = ts_diff; + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); +} + +void iostat_update_and_unbind_ctx(struct bio *bio, int rw) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + bool is_sync = bio->bi_opf & REQ_SYNC; + + if (rw == 0) + bio->bi_private = iostat_ctx->post_read_ctx; + else + bio->bi_private = iostat_ctx->sbi; + __update_iostat_latency(iostat_ctx, rw, is_sync); + mempool_free(iostat_ctx, bio_iostat_ctx_pool); +} + +void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) +{ + struct bio_iostat_ctx *iostat_ctx; + /* Due to the mempool, this never fails. 
*/ + iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS); + iostat_ctx->sbi = sbi; + iostat_ctx->submit_ts = 0; + iostat_ctx->type = 0; + iostat_ctx->post_read_ctx = ctx; + bio->bi_private = iostat_ctx; +} + +int __init f2fs_init_iostat_processing(void) +{ + bio_iostat_ctx_cache = + kmem_cache_create("f2fs_bio_iostat_ctx", + sizeof(struct bio_iostat_ctx), 0, 0, NULL); + if (!bio_iostat_ctx_cache) + goto fail; + bio_iostat_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS, + bio_iostat_ctx_cache); + if (!bio_iostat_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_iostat_ctx_cache); +fail: + return -ENOMEM; +} + +void f2fs_destroy_iostat_processing(void) +{ + mempool_destroy(bio_iostat_ctx_pool); + kmem_cache_destroy(bio_iostat_ctx_cache); +} + int f2fs_init_iostat(struct f2fs_sb_info *sbi) { /* init iostat info */ spin_lock_init(&sbi->iostat_lock); + spin_lock_init(&sbi->iostat_lat_lock); sbi->iostat_enable = false; sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; + sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info), + GFP_KERNEL); + if (!sbi->iostat_io_lat) + return -ENOMEM; return 0; } + +void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) +{ + kfree(sbi->iostat_io_lat); +} diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h index 46e4a36fc8e9..22a2d01f57ef 100644 --- a/fs/f2fs/iostat.h +++ b/fs/f2fs/iostat.h @@ -6,6 +6,8 @@ #ifndef __F2FS_IOSTAT_H__ #define __F2FS_IOSTAT_H__ +struct bio_post_read_ctx; + #ifdef CONFIG_F2FS_IOSTAT #define DEFAULT_IOSTAT_PERIOD_MS 3000 @@ -13,15 +15,70 @@ /* maximum period of iostat tracing is 1 day */ #define MAX_IOSTAT_PERIOD_MS 8640000 +enum { + READ_IO, + WRITE_SYNC_IO, + WRITE_ASYNC_IO, + MAX_IO_TYPE, +}; + +struct iostat_lat_info { + unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */ + unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */ + unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */ +}; + extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset); extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, enum iostat_type type, unsigned long long io_bytes); + +struct bio_iostat_ctx { + struct f2fs_sb_info *sbi; + unsigned long submit_ts; + enum page_type type; + struct bio_post_read_ctx *post_read_ctx; +}; + +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + iostat_ctx->submit_ts = jiffies; + iostat_ctx->type = type; +} + +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + return iostat_ctx->post_read_ctx; +} + +extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw); +extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx); +extern int f2fs_init_iostat_processing(void); +extern void f2fs_destroy_iostat_processing(void); extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); #else static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, enum iostat_type type, unsigned long long io_bytes) {} +static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} +static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) {} +static inline void 
iostat_update_submit_ctx(struct bio *bio, + enum page_type type) {} +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + return bio->bi_private; +} +static inline int f2fs_init_iostat_processing(void) { return 0; } +static inline void f2fs_destroy_iostat_processing(void) {} static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {} #endif #endif /* __F2FS_IOSTAT_H__ */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 88bf6b586ecf..23e92d971a80 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1568,6 +1568,7 @@ static void f2fs_put_super(struct super_block *sb) #endif fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); destroy_percpu_info(sbi); + f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); #ifdef CONFIG_UNICODE @@ -3960,7 +3961,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = init_percpu_info(sbi); if (err) - goto free_bio_info; + goto free_iostat; if (F2FS_IO_ALIGNED(sbi)) { sbi->write_io_dummy = @@ -4282,6 +4283,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mempool_destroy(sbi->write_io_dummy); free_percpu: destroy_percpu_info(sbi); +free_iostat: + f2fs_destroy_iostat(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -4424,9 +4427,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; - err = f2fs_init_bio_entry_cache(); + err = f2fs_init_iostat_processing(); if (err) goto free_post_read; + err = f2fs_init_bio_entry_cache(); + if (err) + goto free_iostat; err = f2fs_init_bioset(); if (err) goto free_bio_enrty_cache; @@ -4448,6 +4454,8 @@ static int __init init_f2fs_fs(void) f2fs_destroy_bioset(); free_bio_enrty_cache: f2fs_destroy_bio_entry_cache(); +free_iostat: + f2fs_destroy_iostat_processing(); free_post_read: f2fs_destroy_post_read_processing(); free_root_stats: @@ -4482,6 +4490,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); + f2fs_destroy_iostat_processing(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1a856dce344b..349f390ba911 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1903,6 +1903,101 @@ TRACE_EVENT(f2fs_iostat, __entry->app_mrio, __entry->fs_drio, __entry->fs_gdrio, __entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio) ); + +#ifndef __F2FS_IOSTAT_LATENCY_TYPE +#define __F2FS_IOSTAT_LATENCY_TYPE +struct f2fs_iostat_latency { + unsigned int peak_lat; + unsigned int avg_lat; + unsigned int cnt; +}; +#endif /* __F2FS_IOSTAT_LATENCY_TYPE */ + +TRACE_EVENT(f2fs_iostat_latency, + + TP_PROTO(struct f2fs_sb_info *sbi, struct f2fs_iostat_latency (*iostat_lat)[NR_PAGE_TYPE]), + + TP_ARGS(sbi, iostat_lat), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, d_rd_peak) + __field(unsigned int, d_rd_avg) + __field(unsigned int, d_rd_cnt) + __field(unsigned int, n_rd_peak) + __field(unsigned int, n_rd_avg) + __field(unsigned int, n_rd_cnt) + __field(unsigned int, m_rd_peak) + __field(unsigned int, m_rd_avg) + __field(unsigned int, m_rd_cnt) + __field(unsigned int, d_wr_s_peak) + __field(unsigned int, d_wr_s_avg) + __field(unsigned int, d_wr_s_cnt) + __field(unsigned int, n_wr_s_peak) + 
__field(unsigned int, n_wr_s_avg) + __field(unsigned int, n_wr_s_cnt) + __field(unsigned int, m_wr_s_peak) + __field(unsigned int, m_wr_s_avg) + __field(unsigned int, m_wr_s_cnt) + __field(unsigned int, d_wr_as_peak) + __field(unsigned int, d_wr_as_avg) + __field(unsigned int, d_wr_as_cnt) + __field(unsigned int, n_wr_as_peak) + __field(unsigned int, n_wr_as_avg) + __field(unsigned int, n_wr_as_cnt) + __field(unsigned int, m_wr_as_peak) + __field(unsigned int, m_wr_as_avg) + __field(unsigned int, m_wr_as_cnt) + ), + + TP_fast_assign( + __entry->dev = sbi->sb->s_dev; + __entry->d_rd_peak = iostat_lat[0][DATA].peak_lat; + __entry->d_rd_avg = iostat_lat[0][DATA].avg_lat; + __entry->d_rd_cnt = iostat_lat[0][DATA].cnt; + __entry->n_rd_peak = iostat_lat[0][NODE].peak_lat; + __entry->n_rd_avg = iostat_lat[0][NODE].avg_lat; + __entry->n_rd_cnt = iostat_lat[0][NODE].cnt; + __entry->m_rd_peak = iostat_lat[0][META].peak_lat; + __entry->m_rd_avg = iostat_lat[0][META].avg_lat; + __entry->m_rd_cnt = iostat_lat[0][META].cnt; + __entry->d_wr_s_peak = iostat_lat[1][DATA].peak_lat; + __entry->d_wr_s_avg = iostat_lat[1][DATA].avg_lat; + __entry->d_wr_s_cnt = iostat_lat[1][DATA].cnt; + __entry->n_wr_s_peak = iostat_lat[1][NODE].peak_lat; + __entry->n_wr_s_avg = iostat_lat[1][NODE].avg_lat; + __entry->n_wr_s_cnt = iostat_lat[1][NODE].cnt; + __entry->m_wr_s_peak = iostat_lat[1][META].peak_lat; + __entry->m_wr_s_avg = iostat_lat[1][META].avg_lat; + __entry->m_wr_s_cnt = iostat_lat[1][META].cnt; + __entry->d_wr_as_peak = iostat_lat[2][DATA].peak_lat; + __entry->d_wr_as_avg = iostat_lat[2][DATA].avg_lat; + __entry->d_wr_as_cnt = iostat_lat[2][DATA].cnt; + __entry->n_wr_as_peak = iostat_lat[2][NODE].peak_lat; + __entry->n_wr_as_avg = iostat_lat[2][NODE].avg_lat; + __entry->n_wr_as_cnt = iostat_lat[2][NODE].cnt; + __entry->m_wr_as_peak = iostat_lat[2][META].peak_lat; + __entry->m_wr_as_avg = iostat_lat[2][META].avg_lat; + __entry->m_wr_as_cnt = iostat_lat[2][META].cnt; + ), + + TP_printk("dev = (%d,%d), " + "iotype [peak lat.(ms)/avg lat.(ms)/count], " + "rd_data [%u/%u/%u], rd_node [%u/%u/%u], rd_meta [%u/%u/%u], " + "wr_sync_data [%u/%u/%u], wr_sync_node [%u/%u/%u], " + "wr_sync_meta [%u/%u/%u], wr_async_data [%u/%u/%u], " + "wr_async_node [%u/%u/%u], wr_async_meta [%u/%u/%u]", + show_dev(__entry->dev), + __entry->d_rd_peak, __entry->d_rd_avg, __entry->d_rd_cnt, + __entry->n_rd_peak, __entry->n_rd_avg, __entry->n_rd_cnt, + __entry->m_rd_peak, __entry->m_rd_avg, __entry->m_rd_cnt, + __entry->d_wr_s_peak, __entry->d_wr_s_avg, __entry->d_wr_s_cnt, + __entry->n_wr_s_peak, __entry->n_wr_s_avg, __entry->n_wr_s_cnt, + __entry->m_wr_s_peak, __entry->m_wr_s_avg, __entry->m_wr_s_cnt, + __entry->d_wr_as_peak, __entry->d_wr_as_avg, __entry->d_wr_as_cnt, + __entry->n_wr_as_peak, __entry->n_wr_as_avg, __entry->n_wr_as_cnt, + __entry->m_wr_as_peak, __entry->m_wr_as_avg, __entry->m_wr_as_cnt) +); #endif TRACE_EVENT(f2fs_bmap, From 8f5d3b4d95df786e81d65d80e5c11db283d26c5e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 20 Aug 2021 18:54:59 +0800 Subject: [PATCH 313/580] f2fs: rebuild nat_bits during umount If all free_nat_bitmap are available, we can rebuild nat_bits from free_nat_bitmap entirely during umount, let's make another chance to reenable nat_bits for image. 
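For illustration, the rebuild amounts to deriving, for each NAT block, how many entries are still in use from the in-memory free-nid bitmap and then marking the block as full, empty, or neither. A simplified sketch of that counting step with hypothetical example_* names (the per-block entry count is an assumption here, and the real code additionally special-cases nid 0):

/* sketch: derive full/empty NAT-block bits from a free-entry bitmap */
#include <linux/bitmap.h>

#define EXAMPLE_ENTRIES_PER_BLOCK 455	/* assumption, for illustration */

static void example_rebuild_block_bits(const unsigned long *free_bitmap,
				       unsigned long *full_bits,
				       unsigned long *empty_bits,
				       unsigned int block)
{
	/* a set bit in free_bitmap means the entry is free */
	unsigned int free = bitmap_weight(free_bitmap, EXAMPLE_ENTRIES_PER_BLOCK);
	unsigned int valid = EXAMPLE_ENTRIES_PER_BLOCK - free;

	if (valid == 0) {
		__set_bit(block, empty_bits);
		__clear_bit(block, full_bits);
	} else if (valid == EXAMPLE_ENTRIES_PER_BLOCK) {
		__clear_bit(block, empty_bits);
		__set_bit(block, full_bits);
	} else {
		__clear_bit(block, empty_bits);
		__clear_bit(block, full_bits);
	}
}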
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 21 ++++++--- fs/f2fs/f2fs.h | 32 +------------- fs/f2fs/node.c | 101 +++++++++++++++++++++++++++++++++---------- 3 files changed, 95 insertions(+), 59 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e3b828cd660d..0b3b1b53b356 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1303,12 +1303,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; - spin_lock_irqsave(&sbi->cp_lock, flags); + if (cpc->reason & CP_UMOUNT) { + if (le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) { + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to no space"); + } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && + f2fs_nat_bitmap_enabled(sbi)) { + f2fs_enable_nat_bits(sbi); + set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Rebuild and enable nat_bits"); + } + } - if ((cpc->reason & CP_UMOUNT) && - le32_to_cpu(ckpt->cp_pack_total_block_count) > - sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) - disable_nat_bits(sbi, false); + spin_lock_irqsave(&sbi->cp_lock, flags); if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); @@ -1494,7 +1502,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); /* write nat bits */ - if (enabled_nat_bits(sbi, cpc)) { + if ((cpc->reason & CP_UMOUNT) && + is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c6df19b0937f..6b2269da5da0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2077,36 +2077,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } -static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) -{ - unsigned long flags; - unsigned char *nat_bits; - - /* - * In order to re-enable nat_bits we need to call fsck.f2fs by - * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, - * so let's rely on regular fsck or unclean shutdown. - */ - - if (lock) - spin_lock_irqsave(&sbi->cp_lock, flags); - __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); - nat_bits = NM_I(sbi)->nat_bits; - NM_I(sbi)->nat_bits = NULL; - if (lock) - spin_unlock_irqrestore(&sbi->cp_lock, flags); - - kvfree(nat_bits); -} - -static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, - struct cp_control *cpc) -{ - bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - - return (cpc) ? 
(cpc->reason & CP_UMOUNT) && set : set; -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -3464,6 +3434,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); @@ -3488,6 +3459,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2356a047e9fe..4181546de0f8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2213,6 +2213,24 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i; + bool ret = true; + + down_read(&nm_i->nat_tree_lock); + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) { + ret = false; + break; + } + } + up_read(&nm_i->nat_tree_lock); + + return ret; +} + static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, bool build) { @@ -2884,7 +2902,23 @@ static void __adjust_nat_entry_set(struct nat_entry_set *nes, list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, + unsigned int valid) +{ + if (valid == 0) { + __set_bit_le(nat_ofs, nm_i->empty_nat_bits); + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_ofs, nm_i->full_nat_bits); + else + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); +} + +static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2893,7 +2927,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid = 0; int i = 0; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; if (nat_index == 0) { @@ -2904,17 +2938,36 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - if (valid == 0) { - __set_bit_le(nat_index, nm_i->empty_nat_bits); - __clear_bit_le(nat_index, nm_i->full_nat_bits); - return; + + __update_nat_bits(nm_i, nat_index, valid); +} + +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs; + + down_read(&nm_i->nat_tree_lock); + + for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { + unsigned int valid = 0, nid_ofs = 0; + + /* handle nid zero due to it should never be used */ + if (unlikely(nat_ofs == 0)) { + valid = 1; + nid_ofs = 1; + } + + for (; nid_ofs < 
NAT_ENTRY_PER_BLOCK; nid_ofs++) { + if (!test_bit_le(nid_ofs, + nm_i->free_nid_bitmap[nat_ofs])) + valid++; + } + + __update_nat_bits(nm_i, nat_ofs, valid); } - __clear_bit_le(nat_index, nm_i->empty_nat_bits); - if (valid == NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_index, nm_i->full_nat_bits); - else - __clear_bit_le(nat_index, nm_i->full_nat_bits); + up_read(&nm_i->nat_tree_lock); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -2933,7 +2986,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (enabled_nat_bits(sbi, cpc) || + if ((cpc->reason & CP_UMOUNT) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -2980,7 +3033,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - __update_nat_bits(sbi, start_nid, page); + update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -3011,7 +3064,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * during unmount, let's flush nat_bits before checking * nat_cnt[DIRTY_NAT]. */ - if (enabled_nat_bits(sbi, cpc)) { + if (cpc->reason & CP_UMOUNT) { down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); up_write(&nm_i->nat_tree_lock); @@ -3027,7 +3080,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (enabled_nat_bits(sbi, cpc) || + if (cpc->reason & CP_UMOUNT || !__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3064,15 +3117,18 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; - if (!enabled_nat_bits(sbi, NULL)) - return 0; - nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kvzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + return 0; + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -3089,13 +3145,12 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - disable_nat_bits(sbi, true); + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", + cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); return 0; } - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -3106,7 +3161,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; for (i = 0; i < nm_i->nat_blocks; i++) { From e180d13a39a07a2f27b0c241a1fccda689aed2dc Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Thu, 19 Aug 2021 16:02:37 +0800 Subject: [PATCH 314/580] f2fs: Don't create discard thread when device doesn't support realtime discard Don't create discard thread when device doesn't support realtime discard or user 
specifies nodiscard mount option. Signed-off-by: Fengnan Chang Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 25 +++++++++++++++++++------ fs/f2fs/super.c | 27 ++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6b2269da5da0..97dcdbcd7052 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3483,6 +3483,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 418c3ef00d9c..389f52927981 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2114,9 +2114,25 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, wake_up_discard_thread(sbi, false); } -static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int err = 0; + + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) + err = PTR_ERR(dcc->f2fs_issue_discard); + + return err; +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ struct discard_cmd_control *dcc; int err = 0, i; @@ -2154,13 +2170,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: - dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, - "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) { - err = PTR_ERR(dcc->f2fs_issue_discard); + err = f2fs_start_discard_thread(sbi); + if (err) { kfree(dcc); SM_I(sbi)->dcc_info = NULL; - return err; } return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 23e92d971a80..622391187413 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2092,12 +2092,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false, need_stop_gc = false; bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; + bool need_restart_discard = false, need_stop_discard = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); + bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); + struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2269,11 +2272,26 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_flush = true; } + if (no_discard == !!test_opt(sbi, DISCARD)) { + if (test_opt(sbi, DISCARD)) { + err = f2fs_start_discard_thread(sbi); + if (err) + goto restore_flush; + need_stop_discard = true; + } else { + dcc = 
SM_I(sbi)->dcc_info; + f2fs_stop_discard_thread(sbi); + if (atomic_read(&dcc->discard_cmd_cnt)) + f2fs_issue_discard_timeout(sbi); + need_restart_discard = true; + } + } + if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) - goto restore_flush; + goto restore_discard; } else { f2fs_enable_checkpoint(sbi); } @@ -2293,6 +2311,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_discard: + if (need_restart_discard) { + if (f2fs_start_discard_thread(sbi)) + f2fs_warn(sbi, "discard has been stopped"); + } else if (need_stop_discard) { + f2fs_stop_discard_thread(sbi); + } restore_flush: if (need_restart_flush) { if (f2fs_create_flush_cmd_control(sbi)) From 24bc3b723575973b08834fc1ed1776aec60d5bfb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Aug 2021 08:11:38 +0800 Subject: [PATCH 315/580] f2fs: adjust unlock order for cleanup This patch adjusts unlock order of .i_mmap_sem and .i_gc_rwsem for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fe609f24688f..33715001046e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3644,8 +3644,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) released_blocks += ret; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); out: inode_unlock(inode); @@ -3797,8 +3797,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) reserved_blocks += ret; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); From 102cf1c406622fb213a06fee4f0eab14c28921b4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Aug 2021 08:12:08 +0800 Subject: [PATCH 316/580] f2fs: fix to account missing .skipped_gc_rwsem There is a missing place we forgot to account .skipped_gc_rwsem, fix it. 
Fixes: 6f8d4455060d ("f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2c18443972b6..77391e3b7d68 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1500,8 +1500,10 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, int err; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->i_gc_rwsem[READ])) + if (!down_write_trylock(&fi->i_gc_rwsem[READ])) { + sbi->skipped_gc_rwsem++; continue; + } if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; From d13130b664b99ac46bdc73268f5e2117c2ab395d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Aug 2021 10:03:15 +0800 Subject: [PATCH 317/580] f2fs: fix unexpected ENOENT comes from f2fs_map_blocks() In below path, it will return ENOENT if filesystem is shutdown: - f2fs_map_blocks - f2fs_get_dnode_of_data - f2fs_get_node_page - __get_node_page - read_node_page - is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) return -ENOENT - force return value from ENOENT to 0 It should be fine for read case, since it indicates a hole condition, and caller could use .m_next_pgofs to skip the hole and continue the lookup. However it may cause confusing for write case, since leaving a hole there, and said nothing was wrong doesn't help. There is at least one case from dax_iomap_actor() will complain that, so fix this in prior to supporting dax in f2fs. xfstest generic/388 reports below warning: ubuntu godown: xfstests-induced forced shutdown of /mnt/scratch_f2fs: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 485833 at fs/dax.c:1127 dax_iomap_actor+0x339/0x370 Call Trace: iomap_apply+0x1c4/0x7b0 ? dax_iomap_rw+0x1c0/0x1c0 dax_iomap_rw+0xad/0x1c0 ? dax_iomap_rw+0x1c0/0x1c0 f2fs_file_write_iter+0x5ab/0x970 [f2fs] do_iter_readv_writev+0x273/0x2e0 do_iter_write+0xab/0x1f0 vfs_iter_write+0x21/0x40 iter_file_splice_write+0x287/0x540 do_splice+0x37c/0xa60 __x64_sys_splice+0x15f/0x3a0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae ubuntu godown: xfstests-induced forced shutdown of /mnt/scratch_f2fs: ------------[ cut here ]------------ RIP: 0010:dax_iomap_pte_fault.isra.0+0x72e/0x14a0 Call Trace: dax_iomap_fault+0x44/0x70 f2fs_dax_huge_fault+0x155/0x400 [f2fs] f2fs_dax_fault+0x18/0x30 [f2fs] __do_fault+0x4e/0x120 do_fault+0x3cf/0x7a0 __handle_mm_fault+0xa8c/0xf20 ? find_held_lock+0x39/0xd0 handle_mm_fault+0x1b6/0x480 do_user_addr_fault+0x320/0xcd0 ? rcu_read_lock_sched_held+0x67/0xc0 exc_page_fault+0x77/0x3f0 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 Fixes: 83a3bfdb5a8a ("f2fs: indicate shutdown f2fs to allow unmount successfully") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 84a793dff4d2..91badbd6c464 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1466,7 +1466,21 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; + if (err == -ENOENT) { + /* + * There is one exceptional case that read_node_page() + * may return -ENOENT due to filesystem has been + * shutdown or cp_error, so force to convert error + * number to EIO for such case. 
+ */ + if (map->m_may_create && + (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + f2fs_cp_error(sbi))) { + err = -EIO; + goto unlock_out; + } + err = 0; if (map->m_next_pgofs) *map->m_next_pgofs = From 4cedafe4c823df35b9bf61780f6a75fe881b1081 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Aug 2021 19:34:19 +0800 Subject: [PATCH 318/580] f2fs: fix to unmap pages from userspace process in punch_hole() We need to unmap pages from userspace process before removing pagecache in punch_hole() like we did in f2fs_setattr(). Similar change: commit 5e44f8c374dc ("ext4: hole-punch use truncate_pagecache_range") Fixes: fbfa2cc58d53 ("f2fs: add file operations") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 33715001046e..3816c7d8f6a5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1123,7 +1123,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } if (pg_start < pg_end) { - struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1135,8 +1134,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); - truncate_inode_pages_range(mapping, blk_start, - blk_end - 1); + truncate_pagecache_range(inode, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = f2fs_truncate_hole(inode, pg_start, pg_end); From 6608ba097fa7dced01895348972f1c3d1abb1977 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Aug 2021 14:00:57 -0700 Subject: [PATCH 319/580] f2fs: guarantee to write dirty data when enabling checkpoint back We must flush all the dirty data when enabling checkpoint back. Let's guarantee that first by adding a retry logic on sync_inodes_sb(). In addition to that, this patch adds to flush data in fsync when checkpoint is disabled, which can mitigate the sync_inodes_sb() failures in advance. 
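For illustration, the added guarantee is a bounded-retry writeback loop: keep calling sync_inodes_sb() until no dirty data pages remain or the retry budget runs out, then warn. A condensed sketch of that pattern, with hypothetical example_* helper and constants, and msleep() standing in for the congestion_wait() backoff the patch uses:

/* sketch: bounded retry of a full data flush (simplified) */
#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/printk.h>

#define EXAMPLE_RETRY_COUNT	8	/* assumption, for illustration */
#define EXAMPLE_BACKOFF_MS	20	/* assumption, for illustration */

/* stand-in: a real filesystem would read its own dirty-page counter here */
static long example_dirty_data_pages(struct super_block *sb)
{
	return 0;
}

static void example_flush_all_dirty_data(struct super_block *sb)
{
	int retry = EXAMPLE_RETRY_COUNT;

	do {
		sync_inodes_sb(sb);		/* push all dirty inode data */
		cond_resched();
		msleep(EXAMPLE_BACKOFF_MS);	/* crude backoff between passes */
	} while (example_dirty_data_pages(sb) && retry--);

	if (retry < 0)
		pr_warn("some dirty data may remain unwritten\n");
}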
Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++--- fs/f2fs/super.c | 11 ++++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3816c7d8f6a5..8e35fb117200 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -261,8 +261,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, }; unsigned int seq_id = 0; - if (unlikely(f2fs_readonly(inode->i_sb) || - is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); @@ -276,7 +275,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, ret = file_write_and_wait_range(file, start, end); clear_inode_flag(inode, FI_NEED_IPU); - if (ret) { + if (ret || is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 622391187413..f094a218ee64 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2070,8 +2070,17 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { + int retry = DEFAULT_RETRY_IO_COUNT; + /* we should flush all the data to keep data consistency */ - sync_inodes_sb(sbi->sb); + do { + sync_inodes_sb(sbi->sb); + cond_resched(); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); + + if (unlikely(retry < 0)) + f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); From dc085c6db2c9d8d88e517ea2a1bd6e849d181172 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Aug 2021 08:35:33 +0800 Subject: [PATCH 320/580] f2fs: enable realtime discard iff device supports discard Let's only enable realtime discard if and only if device supports discard functionality. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f094a218ee64..f4478107ea9c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -658,10 +658,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; break; case Opt_discard: + if (!f2fs_hw_support_discard(sbi)) { + f2fs_warn(sbi, "device does not support discard"); + break; + } set_opt(sbi, DISCARD); break; case Opt_nodiscard: - if (f2fs_sb_has_blkzoned(sbi)) { + if (f2fs_hw_should_discard(sbi)) { f2fs_warn(sbi, "discard is required for zoned block devices"); return -EINVAL; } @@ -1988,7 +1992,8 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - set_opt(sbi, DISCARD); + if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) + set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; From 0b6a2759857312a31c95e85b4925de05a85260f3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 30 Aug 2021 11:37:32 -0700 Subject: [PATCH 321/580] f2fs: deallocate compressed pages when error happens In f2fs_write_multi_pages(), f2fs_compress_pages() allocates pages for compression work in cc->cpages[]. Then, f2fs_write_compressed_pages() initiates bio submission. But, if there's any error before submitting the IOs like early f2fs_cp_error(), previously it didn't free cpages by f2fs_compress_free_page(). 
Let's fix memory leak by putting that just before deallocating cc->cpages. Fixes: 4c8ff7095bef ("f2fs: support data compression") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index df6f08c2613c..4e1cd596cd1a 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1365,12 +1365,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (--i; i >= 0; i--) fscrypt_finalize_bounce_page(&cc->cpages[i]); - for (i = 0; i < cc->nr_cpages; i++) { - if (!cc->cpages[i]) - continue; - f2fs_compress_free_page(cc->cpages[i]); - cc->cpages[i] = NULL; - } out_put_cic: kmem_cache_free(cic_entry_slab, cic); out_put_dnode: @@ -1381,6 +1375,12 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, else f2fs_unlock_op(sbi); out_free: + for (i = 0; i < cc->nr_cpages; i++) { + if (!cc->cpages[i]) + continue; + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; return -EAGAIN; From 814f6a843b9ef2515638ac347d85daf865aa620c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 30 Aug 2021 13:30:45 -0700 Subject: [PATCH 322/580] f2fs: should put a page beyond EOF when preparing a write The prepare_compress_overwrite() gets/locks a page to prepare a read, and calls f2fs_read_multi_pages() which checks EOF first. If there's any page beyond EOF, we unlock the page and set cc->rpages[i] = NULL, which we can't put the page anymore. This makes page leak, so let's fix by putting that page. Fixes: a949dc5f2c5c ("f2fs: compress: fix race condition of overwrite vs truncate") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 91badbd6c464..c8e41a296790 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2144,6 +2144,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, continue; } unlock_page(page); + if (for_write) + put_page(page); cc->rpages[i] = NULL; cc->nr_rpages--; } From e004c33e702a5176a3010fb9939ba2ec947e8285 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Sep 2021 10:24:21 -0700 Subject: [PATCH 323/580] f2fs: should use GFP_NOFS for directory inodes We use inline_dentry which requires to allocate dentry page when adding a link. If we allow to reclaim memory from filesystem, we do down_read(&sbi->cp_rwsem) twice by f2fs_lock_op(). I think this should be okay, but how about stopping the lockdep complaint [1]? 
f2fs_create() - f2fs_lock_op() - f2fs_do_add_link() - __f2fs_find_entry - f2fs_get_read_data_page() -> kswapd - shrink_node - f2fs_evict_inode - f2fs_lock_op() [1] fs_reclaim ){+.+.}-{0:0} : kswapd0: lock_acquire+0x114/0x394 kswapd0: __fs_reclaim_acquire+0x40/0x50 kswapd0: prepare_alloc_pages+0x94/0x1ec kswapd0: __alloc_pages_nodemask+0x78/0x1b0 kswapd0: pagecache_get_page+0x2e0/0x57c kswapd0: f2fs_get_read_data_page+0xc0/0x394 kswapd0: f2fs_find_data_page+0xa4/0x23c kswapd0: find_in_level+0x1a8/0x36c kswapd0: __f2fs_find_entry+0x70/0x100 kswapd0: f2fs_do_add_link+0x84/0x1ec kswapd0: f2fs_mkdir+0xe4/0x1e4 kswapd0: vfs_mkdir+0x110/0x1c0 kswapd0: do_mkdirat+0xa4/0x160 kswapd0: __arm64_sys_mkdirat+0x24/0x34 kswapd0: el0_svc_common.llvm.17258447499513131576+0xc4/0x1e8 kswapd0: do_el0_svc+0x28/0xa0 kswapd0: el0_svc+0x24/0x38 kswapd0: el0_sync_handler+0x88/0xec kswapd0: el0_sync+0x1c0/0x200 kswapd0: -> #1 ( &sbi->cp_rwsem ){++++}-{3:3} : kswapd0: lock_acquire+0x114/0x394 kswapd0: down_read+0x7c/0x98 kswapd0: f2fs_do_truncate_blocks+0x78/0x3dc kswapd0: f2fs_truncate+0xc8/0x128 kswapd0: f2fs_evict_inode+0x2b8/0x8b8 kswapd0: evict+0xd4/0x2f8 kswapd0: iput+0x1c0/0x258 kswapd0: do_unlinkat+0x170/0x2a0 kswapd0: __arm64_sys_unlinkat+0x4c/0x68 kswapd0: el0_svc_common.llvm.17258447499513131576+0xc4/0x1e8 kswapd0: do_el0_svc+0x28/0xa0 kswapd0: el0_svc+0x24/0x38 kswapd0: el0_sync_handler+0x88/0xec kswapd0: el0_sync+0x1c0/0x200 Cc: stable@vger.kernel.org Fixes: bdbc90fa55af ("f2fs: don't put dentry page in pagecache into highmem") Reviewed-by: Chao Yu Reviewed-by: Stanley Chu Reviewed-by: Light Hsieh Tested-by: Light Hsieh Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- fs/f2fs/namei.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9e32c338052d..5d180bade6e4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -527,7 +527,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (S_ISLNK(inode->i_mode)) { if (file_is_encrypt(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a21144415604..654938c9f686 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -752,7 +752,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); From e7fea02e79918774f6856a0459335f07951840f2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 3 Sep 2021 10:38:11 +0800 Subject: [PATCH 324/580] f2fs: quota: fix potential deadlock As Yi Zhuang reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=214299 There is potential deadlock during quota data flush as below: Thread A: Thread B: f2fs_dquot_acquire down_read(&sbi->quota_sem) f2fs_write_checkpoint block_operations f2fs_look_all down_write(&sbi->cp_rwsem) f2fs_quota_write f2fs_write_begin __do_map_lock f2fs_lock_op down_read(&sbi->cp_rwsem) __need_flush_qutoa down_write(&sbi->quota_sem) This patch changes block_operations() to use trylock, if it fails, it means there is potential quota data updater, in this condition, let's flush quota data 
first and then trylock again to check dirty status of quota data. The side effect is: in heavy race condition (e.g. multi quota data upaters vs quota data flusher), it may decrease the probability of synchronizing quota data successfully in checkpoint() due to limited retry time of quota flush. Reported-by: Yi Zhuang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0b3b1b53b356..e34d60becafc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1162,7 +1162,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) if (!is_journalled_quota(sbi)) return false; - down_write(&sbi->quota_sem); + if (!down_write_trylock(&sbi->quota_sem)) + return true; if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { ret = false; } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) { From a21312a15fc08250649e9689757d9c7a0a967adf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Sep 2021 16:06:21 +0800 Subject: [PATCH 325/580] f2fs: avoid attaching SB_ACTIVE flag during mount Quoted from [1] "I do remember that I've added this code back then because otherwise orphan cleanup was losing updates to quota files. But you're right that now I don't see how that could be happening and it would be nice if we could get rid of this hack" [1] https://lore.kernel.org/linux-ext4/99cce8ca-e4a0-7301-840f-2ace67c551f3@huawei.com/T/#m04990cfbc4f44592421736b504afcc346b2a7c00 Related fix in ext4 by commit 72ffb49a7b62 ("ext4: do not set SB_ACTIVE in ext4_orphan_cleanup()"). f2fs has the same hack implementation in - f2fs_recover_orphan_inodes() - f2fs_recover_fsync_data() Let's get rid of this hack as well in f2fs. Cc: Zhang Yi Cc: Jan Kara Acked-by: Jan Kara Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 --- fs/f2fs/recovery.c | 8 ++------ 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e34d60becafc..e4d2ec540b08 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -705,9 +705,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; - /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. 
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 1528a43123f7..d44e7c9d7397 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -786,8 +786,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif @@ -815,10 +813,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); - else { - /* restore s_flags to let iput() trash data */ - sbi->sb->s_flags = s_flags; - } + else + f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); skip: destroy_fsync_dnodes(&inode_list, err); destroy_fsync_dnodes(&tmp_inode_list, err); From 568b500362ab88c24728f7a20aed382502efd6d7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 16 Sep 2021 17:09:03 +0800 Subject: [PATCH 326/580] f2fs: introduce excess_dirty_threshold() This patch enables f2fs_balance_fs_bg() to check the dirty threshold of all metadata types rather than just the node blocks', so that a background checkpoint() can be triggered more frequently to avoid piling up too much dirty metadata.

Threshold value by default:

  race with foreground ops    single type    global
  No                          16MB           24MB
  Yes                         24MB           36MB

In addition, let f2fs_balance_fs_bg() be aware of roll-forward space as well as fsync(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/node.h | 5 ----- fs/f2fs/segment.c | 23 +++++++++++++++++++++-- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 97dcdbcd7052..142f8f1a8215 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -560,6 +560,9 @@ enum { #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ +/* dirty segments threshold for triggering CP */ +#define DEFAULT_DIRTY_THRESHOLD 4 + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index ff14a6e5ac1c..18b98cf0465b 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -138,11 +138,6 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; } -static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) -{ - return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8; -} - enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 389f52927981..378f5f37fa45 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -529,6 +529,25 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } } +static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) +{ + int factor = rwsem_is_locked(&sbi->cp_rwsem) ?
3 : 2; + unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); + unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); + unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int threshold = sbi->blocks_per_seg * factor * + DEFAULT_DIRTY_THRESHOLD; + unsigned int global_threshold = threshold * 3 / 2; + + if (dents >= threshold || qdata >= threshold || + nodes >= threshold || meta >= threshold || + imeta >= threshold) + return true; + return dents + qdata + nodes + meta + imeta > global_threshold; +} + void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -547,8 +566,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) else f2fs_build_free_nids(sbi, false, false); - if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) || - excess_prefree_segs(sbi)) + if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || + excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) goto do_sync; /* there is background inflight IO or foreground operation recently */ From e4caa6c2ea442e8421f5ee5b8ea140369609e442 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Sat, 18 Sep 2021 20:46:36 +0800 Subject: [PATCH 327/580] f2fs: set SBI_NEED_FSCK flag when inconsistent node block found Inconsistent node block will cause a file fail to open or read, which could make the user process crashes or stucks. Let's mark SBI_NEED_FSCK flag to trigger a fix at next fsck time. After unlinking the corrupted file, the user process could regenerate a new one and work correctly. Signed-off-by: Weichao Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4181546de0f8..dae0b055fa91 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1443,6 +1443,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EINVAL; out_err: ClearPageUptodate(page); From 06696f282ecb0339565a8eea7d668d592f849d7c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 21 Sep 2021 22:37:30 +0800 Subject: [PATCH 328/580] f2fs: fix up f2fs_lookup tracepoints Fix up a misuse that the filename pointer isn't always valid in the ring buffer, and we should copy the content instead. 
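To make the conversion easier to follow, here is a minimal sketch of the safe tracepoint pattern using a hypothetical event (not f2fs code, and omitting the usual trace-header boilerplate): __string()/__assign_str()/__get_str() copy the name into the trace ring buffer when the event fires, instead of recording a pointer that may already be freed by the time the buffer is read.

	TRACE_EVENT(sample_lookup,
		TP_PROTO(struct dentry *dentry),
		TP_ARGS(dentry),
		TP_STRUCT__entry(
			__string(name, dentry->d_name.name)	/* reserves space for a copy */
		),
		TP_fast_assign(
			__assign_str(name, dentry->d_name.name);	/* snapshot taken at trace time */
		),
		TP_printk("name:%s", __get_str(name))
	);

The hunks below apply exactly this conversion to f2fs_lookup_start and f2fs_lookup_end.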
Fixes: 0c5e36db17f5 ("f2fs: trace f2fs_lookup") Signed-off-by: Gao Xiang Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 349f390ba911..92b23c98db26 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -816,20 +816,20 @@ TRACE_EVENT(f2fs_lookup_start, TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(const char *, name) + __string(name, dentry->d_name.name) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u", show_dev_ino(__entry), - __entry->name, + __get_str(name), __entry->flags) ); @@ -843,7 +843,7 @@ TRACE_EVENT(f2fs_lookup_end, TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(const char *, name) + __string(name, dentry->d_name.name) __field(nid_t, cino) __field(int, err) ), @@ -851,14 +851,14 @@ TRACE_EVENT(f2fs_lookup_end, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); __entry->cino = ino; __entry->err = err; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d", show_dev_ino(__entry), - __entry->name, + __get_str(name), __entry->cino, __entry->err) ); From b141c2d8b0d6b7cefde75d2746e7eae02dc15095 Mon Sep 17 00:00:00 2001 From: Keoseong Park Date: Mon, 27 Sep 2021 15:06:48 +0900 Subject: [PATCH 329/580] f2fs: fix to use WHINT_MODE Since active_logs can be set to 2 or 4 or NR_CURSEG_PERSIST_TYPE(6), it cannot be set to NR_CURSEG_TYPE(8). That is, whint_mode is always off. Therefore, the condition is changed from NR_CURSEG_TYPE to NR_CURSEG_PERSIST_TYPE. Cc: Chao Yu Fixes: d0b9e42ab615 (f2fs: introduce inmem curseg) Reported-by: tanghuan Signed-off-by: Keoseong Park Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f4478107ea9c..41b79a990692 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1274,7 +1274,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) /* Not pass down write hints if the number of active logs is lesser * than NR_CURSEG_PERSIST_TYPE. */ - if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) + if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { From 997755e2d4a8c18dc19d96b9b5fc339d3180d347 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 29 Sep 2021 03:19:14 +0800 Subject: [PATCH 330/580] f2fs: fix wrong condition to trigger background checkpoint correctly In f2fs_balance_fs_bg(), it needs to check both NAT_ENTRIES and INO_ENTRIES memory usage to decide whether we should skip background checkpoint, otherwise we may always skip checking INO_ENTRIES memory usage, so that INO_ENTRIES may potentially cause high memory footprint. 
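Restating the one-operator change in isolation may help (simplified from the fs/f2fs/segment.c hunk below): the background checkpoint may be skipped only when both the NAT entry cache and the ino entry cache still have free memory; with '||', either one could keep growing as long as the other looked healthy.

	/* Skip the background checkpoint only if BOTH caches are under control. */
	if (f2fs_available_free_memory(sbi, NAT_ENTRIES) &&
	    f2fs_available_free_memory(sbi, INO_ENTRIES))
		return;
	/* otherwise fall through and issue a checkpoint to shrink them */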
Fixes: 493720a48543 ("f2fs: fix to avoid REQ_TIME and CP_TIME collision") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 378f5f37fa45..a67e67b095ef 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -580,7 +580,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) goto do_sync; /* checkpoint is the only way to shrink partial cached entries */ - if (f2fs_available_free_memory(sbi, NAT_ENTRIES) || + if (f2fs_available_free_memory(sbi, NAT_ENTRIES) && f2fs_available_free_memory(sbi, INO_ENTRIES)) return; From 6e58c1c56456082bb1d823cb75d9b4b076aeaf48 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 6 Oct 2021 10:49:10 -0700 Subject: [PATCH 331/580] f2fs: include non-compressed blocks in compr_written_block Need to include non-compressed blocks in compr_written_block to estimate average compression ratio more accurately. Fixes: 5ac443e26a09 ("f2fs: add sysfs nodes to get runtime compression stat") Cc: stable@vger.kernel.org Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4e1cd596cd1a..cae4ceecb174 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1501,6 +1501,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, if (cluster_may_compress(cc)) { err = f2fs_compress_pages(cc); if (err == -EAGAIN) { + add_compr_block_stat(cc->inode, cc->cluster_size); goto write; } else if (err) { f2fs_put_rpages_wbc(cc, wbc, true, 1); From 1bc6e6d2169a220b1d46f09130e0d0fa3d2bda3f Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 29 Sep 2021 11:12:03 -0700 Subject: [PATCH 332/580] f2fs: introduce fragment allocation mode mount option Added two options into "mode=" mount option to make it possible for developers to simulate filesystem fragmentation/after-GC situation itself. The developers use these modes to understand filesystem fragmentation/after-GC condition well, and eventually get some insights to handle them better. "fragment:segment": f2fs allocates a new segment in ramdom position. With this, we can simulate the after-GC condition. "fragment:block" : We can scatter block allocation with "max_fragment_chunk" and "max_fragment_hole" sysfs nodes. f2fs will allocate 1.. blocks in a chunk and make a hole in the length of 1.. by turns in a newly allocated free segment. Plus, this mode implicitly enables "fragment:segment" option for more randomness. Reviewed-by: Chao Yu Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 16 ++++++++++++++++ Documentation/filesystems/f2fs.rst | 18 ++++++++++++++++++ fs/f2fs/f2fs.h | 19 +++++++++++++++++-- fs/f2fs/gc.c | 5 ++++- fs/f2fs/segment.c | 20 ++++++++++++++++++-- fs/f2fs/segment.h | 1 + fs/f2fs/super.c | 10 ++++++++++ fs/f2fs/sysfs.c | 20 ++++++++++++++++++++ 8 files changed, 104 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9c89ad131304..9d28f1d8a5aa 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -486,3 +486,19 @@ Date: July 2021 Contact: "Daeho Jeong" Description: You can control for which gc mode the "gc_reclaimed_segments" node shows. Refer to the description of the modes in "gc_reclaimed_segments". 
+ +What: /sys/fs/f2fs//max_fragment_chunk +Date: August 2021 +Contact: "Daeho Jeong" +Description: With "mode=fragment:block" mount options, we can scatter block allocation. + f2fs will allocate 1.. blocks in a chunk and make a hole + in the length of 1.. by turns. This value can be set + between 1..512 and the default value is 4. + +What: /sys/fs/f2fs//max_fragment_hole +Date: August 2021 +Contact: "Daeho Jeong" +Description: With "mode=fragment:block" mount options, we can scatter block allocation. + f2fs will allocate 1.. blocks in a chunk and make a hole + in the length of 1.. by turns. This value can be set + between 1..512 and the default value is 4. diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 09de6ebbbdfa..4294db649fa8 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -201,6 +201,24 @@ fault_type=%d Support configuring fault injection type, should be mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. + "fragment:segment" and "fragment:block" are newly added here. + These are developer options for experiments to simulate filesystem + fragmentation/after-GC situation itself. The developers use these + modes to understand filesystem fragmentation/after-GC condition well, + and eventually get some insights to handle them better. + In "fragment:segment", f2fs allocates a new segment in ramdom + position. With this, we can simulate the after-GC condition. + In "fragment:block", we can scatter block allocation with + "max_fragment_chunk" and "max_fragment_hole" sysfs nodes. + We added some randomness to both chunk and hole size to make + it close to realistic IO pattern. So, in this mode, f2fs will allocate + 1.. blocks in a chunk and make a hole in the + length of 1.. by turns. With this, the newly + allocated blocks will be scattered throughout the whole partition. + Note that "fragment:block" implicitly enables "fragment:segment" + option for more randomness. + Please, use these options for your experiments and we strongly + recommend to re-format the filesystem after using these options. io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". usrquota Enable plain user disk quota accounting. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 142f8f1a8215..049d73b3c600 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1287,8 +1287,10 @@ enum { }; enum { - FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ - FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ + FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */ + FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ }; enum { @@ -1763,6 +1765,9 @@ struct f2fs_sb_info { unsigned int gc_segment_mode; /* GC state for reclaimed segments */ unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ + int max_fragment_chunk; /* max chunk size for block fragmentation mode */ + int max_fragment_hole; /* max hole size for block fragmentation mode */ + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -3553,6 +3558,16 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno); +#define DEF_FRAGMENT_SIZE 4 +#define MIN_FRAGMENT_SIZE 1 +#define MAX_FRAGMENT_SIZE 512 + +static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG || + F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK; +} + /* * checkpoint.c */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 77391e3b7d68..a946ce0ead34 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -257,7 +258,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->max_search = sbi->max_victim_search; /* let's select beginning hot/small space first in no_heap mode*/ - if (test_opt(sbi, NOHEAP) && + if (f2fs_need_rand_seg(sbi)) + p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; else diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a67e67b095ef..1182f408be47 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -2648,6 +2649,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) unsigned short seg_type = curseg->seg_type; sanity_check_seg_type(sbi, seg_type); + if (f2fs_need_rand_seg(sbi)) + return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ if (__is_large_section(sbi)) @@ -2699,6 +2702,9 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + curseg->fragment_remained_chunk = + prandom_u32() % sbi->max_fragment_chunk + 1; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2725,12 +2731,22 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi, static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg) { - if (seg->alloc_type == SSR) + if (seg->alloc_type == SSR) { seg->next_blkoff = __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); - else + } else { seg->next_blkoff++; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) { + /* To allocate block chunks in different sizes, use random number */ + if (--seg->fragment_remained_chunk <= 0) { + seg->fragment_remained_chunk = + prandom_u32() % sbi->max_fragment_chunk + 1; + seg->next_blkoff += + prandom_u32() % sbi->max_fragment_hole + 1; + } + } + } } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index c030b4436617..355b797ab171 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -314,6 +314,7 @@ struct curseg_info { unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ + int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */ bool inited; /* indicate inmem log is inited */ }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 41b79a990692..55362595cf33 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -814,6 +814,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (!strcmp(name, "lfs")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; + } else if (!strcmp(name, "fragment:segment")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; + } else if (!strcmp(name, "fragment:block")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; } else { kfree(name); return -EINVAL; @@ -1889,6 +1893,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "adaptive"); else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) seq_puts(seq, "lfs"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG) + seq_puts(seq, "fragment:segment"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", @@ -3490,6 +3498,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; sbi->migration_granularity = sbi->segs_per_sec; + sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; + sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 40a6abdb7a27..49d775ecf2bf 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -546,6 +546,22 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "max_fragment_chunk")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_chunk = t; + else + return -EINVAL; + return 
count; + } + + if (!strcmp(a->attr.name, "max_fragment_hole")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_hole = t; + else + return -EINVAL; + return count; + } + *ui = (unsigned int)t; return count; @@ -775,6 +791,8 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -852,6 +870,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(atgc_age_threshold), ATTR_LIST(gc_segment_mode), ATTR_LIST(gc_reclaimed_segments), + ATTR_LIST(max_fragment_chunk), + ATTR_LIST(max_fragment_hole), NULL, }; From 55577feb0363690ebb9ca8c178e9998186de7fbb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Sep 2021 14:39:20 +0800 Subject: [PATCH 333/580] f2fs: multidevice: support direct IO Commit 3c62be17d4f5 ("f2fs: support multiple devices") missed to support direct IO for multiple device feature, this patch adds to support the missing part of multidevice feature. In addition, for multiple device image, we should be aware of any issued direct write IO rather than just buffered write IO, so that fsync and syncfs can issue a preflush command to the device where direct write IO goes, to persist user data for posix compliant. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 57 +++++++++++++++++++++++++++++++++++-- fs/f2fs/f2fs.h | 25 ++++++++++++++-- fs/f2fs/segment.c | 35 ++++++++++++++--------- fs/f2fs/super.c | 7 +++++ include/trace/events/f2fs.h | 21 ++++++++++---- 5 files changed, 120 insertions(+), 25 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c8e41a296790..112964cfdd5f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1427,10 +1427,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; + int bidx = 0; if (!maxblocks) return 0; + map->m_bdev = inode->i_sb->s_bdev; + map->m_multidev_dio = + f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); + map->m_len = 0; map->m_flags = 0; @@ -1453,6 +1458,21 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + map->m_len = min(map->m_len, + FDEV(bidx).end_blk + 1 - map->m_pblk); + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + } goto out; } @@ -1571,6 +1591,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (flag == F2FS_GET_BLOCK_PRE_AIO) goto skip; + if (map->m_multidev_dio) + bidx = f2fs_target_device_index(sbi, blkaddr); + if (map->m_len == 0) { /* preallocated unwritten block should be mapped for fiemap. 
*/ if (blkaddr == NEW_ADDR) @@ -1579,10 +1602,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_pblk = blkaddr; map->m_len = 1; + + if (map->m_multidev_dio) + map->m_bdev = FDEV(bidx).bdev; } else if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || flag == F2FS_GET_BLOCK_PRE_DIO) { + if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) + goto sync_out; ofs++; map->m_len++; } else { @@ -1635,11 +1663,31 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, sync_out: - /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { + /* + * for hardware encryption, but to avoid potential issue + * in future + */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + + f2fs_bug_on(sbi, blk_addr + map->m_len > + FDEV(bidx).end_blk + 1); + } + } + if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; @@ -1658,7 +1706,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_balance_fs(sbi, dn.node_changed); } out: - trace_f2fs_map_blocks(inode, map, err); + trace_f2fs_map_blocks(inode, map, create, flag, err); return err; } @@ -1717,6 +1765,9 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = blks_to_bytes(inode, map.m_len); + + if (map.m_multidev_dio) + bh->b_bdev = map.m_bdev; } return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 049d73b3c600..6e26f3a74792 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -619,6 +619,7 @@ struct extent_tree { F2FS_MAP_UNWRITTEN) struct f2fs_map_blocks { + struct block_device *m_bdev; /* for multi-device dio */ block_t m_pblk; block_t m_lblk; unsigned int m_len; @@ -627,6 +628,7 @@ struct f2fs_map_blocks { pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; bool m_may_create; /* indicate it is from write path */ + bool m_multidev_dio; /* indicate it allows multi-device dio */ }; /* for flag in get_data_block */ @@ -1739,12 +1741,15 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + /* For multi devices */ int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ - struct mutex umount_mutex; - unsigned int shrinker_run_no; + bool aligned_blksize; /* all devices has the same logical blksize */ /* For write statistics */ u64 sectors_written_start; @@ -3536,6 +3541,8 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt); void f2fs_wait_on_page_writeback(struct page *page, enum page_type 
type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); @@ -4370,6 +4377,16 @@ static inline int block_unaligned_IO(struct inode *inode, return align & blocksize_mask; } +static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, + int flag) +{ + if (!f2fs_is_multi_device(sbi)) + return false; + if (flag != F2FS_GET_BLOCK_DIO) + return false; + return sbi->aligned_blksize; +} + static inline bool f2fs_force_buffered_io(struct inode *inode, struct kiocb *iocb, struct iov_iter *iter) { @@ -4378,7 +4395,9 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, if (f2fs_post_read_required(inode)) return true; - if (f2fs_is_multi_device(sbi)) + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) return true; /* * for blkzoned device, fallback direct IO to buffered IO, so diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1182f408be47..62446bfc3d65 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3519,24 +3519,30 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, up_read(&SM_I(sbi)->curseg_lock); } -static void update_device_state(struct f2fs_io_info *fio) +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt) { - struct f2fs_sb_info *sbi = fio->sbi; - unsigned int devidx; - if (!f2fs_is_multi_device(sbi)) return; - devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + while (1) { + unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); + unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; - /* update device state for fsync */ - f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + /* update device state for fsync */ + f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); - /* update device state for checkpoint */ - if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { - spin_lock(&sbi->dev_lock); - f2fs_set_bit(devidx, (char *)&sbi->dirty_device); - spin_unlock(&sbi->dev_lock); + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + if (blkcnt <= blks) + break; + blkcnt -= blks; + blkaddr += blks; } } @@ -3563,7 +3569,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) goto reallocate; } - update_device_state(fio); + f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); if (keep_order) up_read(&fio->sbi->io_order_lock); @@ -3652,7 +3658,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) else err = f2fs_submit_page_bio(fio); if (!err) { - update_device_state(fio); + f2fs_update_device_state(fio->sbi, fio->ino, + fio->new_blkaddr, 1); f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 55362595cf33..120739af82aa 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3717,6 +3717,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned int max_devices = MAX_DEVICES; + unsigned int logical_blksize; int i; /* Initialize single device information */ @@ -3737,6 +3738,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) if (!sbi->devs) return -ENOMEM; + logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); + sbi->aligned_blksize = true; + for (i = 0; i < max_devices; i++) { if (i > 
0 && !RDEV(i).path[0]) @@ -3773,6 +3777,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) /* to release errored devices */ sbi->s_ndevs = i + 1; + if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev)) + sbi->aligned_blksize = false; + #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && !f2fs_sb_has_blkzoned(sbi)) { diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 92b23c98db26..04f8b5fd567a 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -579,9 +579,10 @@ TRACE_EVENT(f2fs_file_write_iter, ); TRACE_EVENT(f2fs_map_blocks, - TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int ret), + TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag, int ret), - TP_ARGS(inode, map, ret), + TP_ARGS(inode, map, create, flag, ret), TP_STRUCT__entry( __field(dev_t, dev) @@ -592,11 +593,14 @@ TRACE_EVENT(f2fs_map_blocks, __field(unsigned int, m_flags) __field(int, m_seg_type) __field(bool, m_may_create) + __field(bool, m_multidev_dio) + __field(int, create) + __field(int, flag) __field(int, ret) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev = map->m_bdev->bd_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_pblk = map->m_pblk; @@ -604,12 +608,16 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_flags = map->m_flags; __entry->m_seg_type = map->m_seg_type; __entry->m_may_create = map->m_may_create; + __entry->m_multidev_dio = map->m_multidev_dio; + __entry->create = create; + __entry->flag = flag; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " - "start blkaddr = 0x%llx, len = 0x%llx, flags = %u," - "seg_type = %d, may_create = %d, err = %d", + "start blkaddr = 0x%llx, len = 0x%llx, flags = %u, " + "seg_type = %d, may_create = %d, multidevice = %d, " + "create = %d, flag = %d, err = %d", show_dev_ino(__entry), (unsigned long long)__entry->m_lblk, (unsigned long long)__entry->m_pblk, @@ -617,6 +625,9 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_flags, __entry->m_seg_type, __entry->m_may_create, + __entry->m_multidev_dio, + __entry->create, + __entry->flag, __entry->ret) ); From ba64cd1684389d9c41da7ad24998db21d7e69ac3 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 22 Oct 2021 20:08:00 -0700 Subject: [PATCH 334/580] f2fs: compress: fix overwrite may reduce compress ratio unproperly when overwrite only first block of cluster, since cluster is not full, it will call f2fs_write_raw_pages when f2fs_write_multi_pages, and cause the whole cluster become uncompressed eventhough data can be compressed. this may will make random write bench score reduce a lot. 
root# dd if=/dev/zero of=./fio-test bs=1M count=1 root# sync root# echo 3 > /proc/sys/vm/drop_caches root# f2fs_io get_cblocks ./fio-test root# dd if=/dev/zero of=./fio-test bs=4K count=1 oflag=direct conv=notrunc w/o patch: root# f2fs_io get_cblocks ./fio-test 189 w/ patch: root# f2fs_io get_cblocks ./fio-test 192 Signed-off-by: Fengnan Chang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 19 +++++++++++++++++++ fs/f2fs/data.c | 36 ++++++++++++++++++------------------ fs/f2fs/f2fs.h | 2 ++ 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index cae4ceecb174..9d5c9fc90c36 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -852,6 +852,25 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } +bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, + int index, int nr_pages) +{ + unsigned long pgidx; + int i; + + if (nr_pages - index < cc->cluster_size) + return false; + + pgidx = pvec->pages[index]->index; + + for (i = 1; i < cc->cluster_size; i++) { + if (pvec->pages[index + i]->index != pgidx + i) + return false; + } + + return true; +} + static bool cluster_has_invalid_data(struct compress_ctx *cc) { loff_t i_size = i_size_read(cc->inode); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 112964cfdd5f..4f4a5698b225 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3030,6 +3030,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, need_readd = false; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + ret = f2fs_init_compress_ctx(&cc); if (ret) { done = 1; @@ -3048,27 +3052,23 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (unlikely(f2fs_cp_error(sbi))) goto lock_page; - if (f2fs_cluster_is_empty(&cc)) { - void *fsdata = NULL; - struct page *pagep; - int ret2; + if (!f2fs_cluster_is_empty(&cc)) + goto lock_page; - ret2 = f2fs_prepare_compress_overwrite( + ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, page->index, &fsdata); - if (ret2 < 0) { - ret = ret2; - done = 1; - break; - } else if (ret2 && - !f2fs_compress_write_end(inode, - fsdata, page->index, - 1)) { - retry = 1; - break; - } - } else { - goto lock_page; + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + (!f2fs_compress_write_end(inode, + fsdata, page->index, 1) || + !f2fs_all_cluster_page_loaded(&cc, + &pvec, i, nr_pages))) { + retry = 1; + break; } } #endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6e26f3a74792..c61d644d9703 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4086,6 +4086,8 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, + int index, int nr_pages); bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); int f2fs_write_multi_pages(struct compress_ctx *cc, From 0c5e9113fe1fb8858b572e98ee3db8c246040129 Mon Sep 17 00:00:00 2001 From: Hyeong-Jun Kim Date: Wed, 27 Oct 2021 13:16:00 +0900 Subject: [PATCH 335/580] f2fs: compress: disallow disabling compress on non-empty compressed file Compresse file and normal file has differ in i_addr addressing, specifically 
addrs per inode/block. So we will face data loss if we disable the compression flag on non-empty files. Therefore we should disallow not only enabling but disabling the compression flag on non-empty files. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Sungjong Seo Signed-off-by: Hyeong-Jun Kim Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c61d644d9703..053fd0b77c1f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4213,8 +4213,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) if (!f2fs_compressed_file(inode)) return true; - if (S_ISREG(inode->i_mode) && - (get_dirty_pages(inode) || atomic_read(&fi->i_compr_blocks))) + if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) return false; fi->i_flags &= ~F2FS_COMPR_FL; From 03ac6540fa19cc4c4d6f80bbf748d288190291c9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 28 Oct 2021 20:45:08 +0800 Subject: [PATCH 336/580] f2fs: fix incorrect return value in f2fs_sanity_check_ckpt() As Pavel Machek reported in [1], this code looks quite confused: part of the function returns 1 on corruption, part returns -errno. The problem is not stable-specific. [1] https://lkml.org/lkml/2021/9/19/207 Let's fix the 'insane cp_payload' case to return 1 rather than -EFSCORRUPTED, so that the return value is kept consistent for all error cases and the code logic stays unambiguous. Fixes: 65ddf6564843 ("f2fs: fix to do sanity check for sb/cp fields correctly") Reported-by: Pavel Machek Reviewed-by: Pavel Machek Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 120739af82aa..6024f6f2fc91 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3463,7 +3463,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) { f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)", cp_payload, nat_bits_blocks); - return -EFSCORRUPTED; + return 1; } if (unlikely(f2fs_cp_error(sbi))) { From ed0c1f0ce1dcd0668597c87583fbc4250681c5b1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 28 Oct 2021 21:03:05 +0800 Subject: [PATCH 337/580] f2fs: support fault injection for dquot_initialize() This patch adds a new function f2fs_dquot_initialize() to wrap dquot_initialize(), and it supports injecting a fault into f2fs_dquot_initialize() to simulate an internal failure of dquot_initialize().
Usage: a) echo 65536 > /sys/fs/f2fs//inject_type or b) mount -o fault_type=65536 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 6 +++--- fs/f2fs/inline.c | 2 +- fs/f2fs/inode.c | 2 +- fs/f2fs/namei.c | 30 +++++++++++++++--------------- fs/f2fs/recovery.c | 6 +++--- fs/f2fs/super.c | 16 ++++++++++++++++ fs/f2fs/verity.c | 2 +- fs/f2fs/xattr.c | 2 +- 11 files changed, 45 insertions(+), 26 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 4294db649fa8..6954c04753ad 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -197,6 +197,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_DISCARD 0x000002000 FAULT_WRITE_IO 0x000004000 FAULT_SLAB_ALLOC 0x000008000 + FAULT_DQUOT_INIT 0x000010000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e4d2ec540b08..48beee4ceb3b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -653,7 +653,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return PTR_ERR(inode); } - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) { iput(inode); goto err_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 053fd0b77c1f..9a4913a1dbc6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -54,6 +54,7 @@ enum { FAULT_DISCARD, FAULT_WRITE_IO, FAULT_SLAB_ALLOC, + FAULT_DQUOT_INIT, FAULT_MAX, }; @@ -3412,6 +3413,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); +int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8e35fb117200..508d1a44bace 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -806,7 +806,7 @@ int f2fs_truncate(struct inode *inode) return -EIO; } - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; @@ -937,7 +937,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (is_quota_modification(inode, attr)) { - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; } @@ -3101,7 +3101,7 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) } f2fs_put_page(ipage, 1); - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3308faef0d10..9ec16011ce05 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -191,7 +191,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 5d180bade6e4..b79b714431af 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -754,7 +754,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) { err = 0; set_sbi_flag(sbi, 
SBI_QUOTA_NEED_REPAIR); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 654938c9f686..8c7ade66e857 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -69,7 +69,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, F2FS_DEF_PROJID); - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto fail_drop; @@ -340,7 +340,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -399,7 +399,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, F2FS_I(old_dentry->d_inode)->i_projid))) return -EXDEV; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -456,7 +456,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -594,10 +594,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) goto fail; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto fail; @@ -671,7 +671,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -741,7 +741,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -798,7 +798,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -836,7 +836,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -959,16 +959,16 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return err; } - err = dquot_initialize(old_dir); + err = f2fs_dquot_initialize(old_dir); if (err) goto out; - err = dquot_initialize(new_dir); + err = f2fs_dquot_initialize(new_dir); if (err) goto out; if (new_inode) { - err = dquot_initialize(new_inode); + err = f2fs_dquot_initialize(new_inode); if (err) goto out; } @@ -1132,11 +1132,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, F2FS_I(new_dentry->d_inode)->i_projid))) return -EXDEV; - err = dquot_initialize(old_dir); + err = f2fs_dquot_initialize(old_dir); if (err) goto out; - err = dquot_initialize(new_dir); + err = f2fs_dquot_initialize(new_dir); if (err) goto out; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d44e7c9d7397..656770e51a8b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -81,7 +81,7 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, if (IS_ERR(inode)) return ERR_CAST(inode); - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto err_out; @@ -203,7 +203,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, goto out_put; } - err = dquot_initialize(einode); + err = f2fs_dquot_initialize(einode); if (err) { iput(einode); goto out_put; @@ 
-508,7 +508,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, if (IS_ERR(inode)) return PTR_ERR(inode); - ret = dquot_initialize(inode); + ret = f2fs_dquot_initialize(inode); if (ret) { iput(inode); return ret; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6024f6f2fc91..79d0f78ea5a2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -57,6 +57,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_DISCARD] = "discard error", [FAULT_WRITE_IO] = "write IO error", [FAULT_SLAB_ALLOC] = "slab alloc", + [FAULT_DQUOT_INIT] = "dquot initialize", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -2487,6 +2488,16 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, return len - towrite; } +int f2fs_dquot_initialize(struct inode *inode) +{ + if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) { + f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT); + return -ESRCH; + } + + return dquot_initialize(inode); +} + static struct dquot **f2fs_get_dquots(struct inode *inode) { return F2FS_I(inode)->i_dquot; @@ -2871,6 +2882,11 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .get_nextdqblk = dquot_get_next_dqblk, }; #else +int f2fs_dquot_initialize(struct inode *inode) +{ + return 0; +} + int f2fs_quota_sync(struct super_block *sb, int type) { return 0; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index fb552a1eec81..f5e9a5e2cb88 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -136,7 +136,7 @@ static int f2fs_begin_enable_verity(struct file *filp) * here and not rely on ->open() doing it. This must be done before * evicting the inline data. */ - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7f59039aef85..052b51c891b5 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -771,7 +771,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; From 68c6b4345ea5adeb39767194d612798056cfbb87 Mon Sep 17 00:00:00 2001 From: Hyeong-Jun Kim Date: Tue, 2 Nov 2021 16:10:02 +0900 Subject: [PATCH 338/580] f2fs: invalidate META_MAPPING before IPU/DIO write Encrypted pages during GC are read and cached in META_MAPPING. However, due to cached pages in META_MAPPING, there is an issue where newly written pages are lost by IPU or DIO writes. Thread A - f2fs_gc() Thread B /* phase 3 */ down_write(i_gc_rwsem) ra_data_block() ---- (a) up_write(i_gc_rwsem) f2fs_direct_IO() : - down_read(i_gc_rwsem) - __blockdev_direct_io() - get_data_block_dio_write() - f2fs_dio_submit_bio() ---- (b) - up_read(i_gc_rwsem) /* phase 4 */ down_write(i_gc_rwsem) move_data_block() ---- (c) up_write(i_gc_rwsem) (a) In phase 3 of f2fs_gc(), up-to-date page is read from storage and cached in META_MAPPING. (b) In thread B, writing new data by IPU or DIO write on same blkaddr as read in (a). cached page in META_MAPPING become out-dated. (c) In phase 4 of f2fs_gc(), out-dated page in META_MAPPING is copied to new blkaddr. In conclusion, the newly written data in (b) is lost. To address this issue, invalidating pages in META_MAPPING before IPU or DIO write. 
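The core of the fix can be read in isolation as follows (simplified from the fs/f2fs/data.c and fs/f2fs/segment.c hunks below, shown here for the in-place-update path): any stale copy of the target block that GC cached in META_MAPPING is dropped before the overwrite is issued, so a later GC phase cannot migrate out-of-date data.

	/* Drop the GC-cached copy of this block before overwriting it in place. */
	invalidate_mapping_pages(META_MAPPING(sbi),
				 fio->new_blkaddr, fio->new_blkaddr);
	stat_inc_inplace_blocks(fio->sbi);

The direct-IO path in f2fs_map_blocks() gets the same invalidation for map->m_pblk.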
Fixes: 6aa58d8ad20a ("f2fs: readahead encrypted block during GC") Signed-off-by: Hyeong-Jun Kim Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ fs/f2fs/segment.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4f4a5698b225..8d632b0c3848 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1670,6 +1670,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + invalidate_mapping_pages(META_MAPPING(sbi), + map->m_pblk, map->m_pblk); if (map->m_multidev_dio) { block_t blk_addr = map->m_pblk; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 62446bfc3d65..56df37331870 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3651,6 +3651,9 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) goto drop_bio; } + invalidate_mapping_pages(META_MAPPING(sbi), + fio->new_blkaddr, fio->new_blkaddr); + stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) From c79c4827e0ed88f2a1c8aae428ccf6da3c104bfe Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Thu, 4 Nov 2021 16:22:01 +0800 Subject: [PATCH 339/580] f2fs: fix UAF in f2fs_available_free_memory if2fs_fill_super -> f2fs_build_segment_manager -> create_discard_cmd_control -> f2fs_start_discard_thread It invokes kthread_run to create a thread and run issue_discard_thread. However, if f2fs_build_node_manager fails, the control flow goes to free_nm and calls f2fs_destroy_node_manager. This function will free sbi->nm_info. However, if issue_discard_thread accesses sbi->nm_info after the deallocation, but before the f2fs_stop_discard_thread, it will cause UAF(Use-after-free). -> f2fs_destroy_segment_manager -> destroy_discard_cmd_control -> f2fs_stop_discard_thread Fix this by stopping discard thread before f2fs_destroy_node_manager. Note that, the commit d6d2b491a82e1 introduces the call of f2fs_available_free_memory into issue_discard_thread. Cc: stable@vger.kernel.org Fixes: d6d2b491a82e ("f2fs: allow to change discard policy based on cached discard cmds") Signed-off-by: Dongliang Mu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 79d0f78ea5a2..ae3ec331a657 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4334,6 +4334,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_stats: f2fs_destroy_stats(sbi); free_nm: + /* stop discard thread before destroying node manager */ + f2fs_stop_discard_thread(sbi); f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); From e0ecd6f0ccab874233fc2d09fe11d7c88958f443 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 30 Nov 2021 10:22:39 -0800 Subject: [PATCH 340/580] f2fs: check nr_pages for readahead Old kernel versions have the issue, since upstream f2fs removed this flow by: commit d4384de9eb5e ("f2fs: Remove readahead collision detection"). Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8d632b0c3848..6c532027d5eb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2360,6 +2360,10 @@ int f2fs_mpage_readpages(struct address_space *mapping, int ret = 0; bool drop_ra = false; + /* this is real from f2fs_merkle_tree_readahead() in old kernel only. 
*/ + if (!nr_pages) + return 0; + map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; From 70ac913a0820e9d689c0768e3c6ed51a148d8b54 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 29 Nov 2021 10:36:12 -0800 Subject: [PATCH 341/580] f2fs: show number of pending discard commands This information can be used to check how much time we need to give to issue all the discard commands. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 5 +++++ fs/f2fs/sysfs.c | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9d28f1d8a5aa..ad1b2c65333c 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -102,6 +102,11 @@ Contact: "Jaegeuk Kim" Description: Set timeout to issue discard commands during umount. Default: 5 secs +What: /sys/fs/f2fs//pending_discard +Date: November 2021 +Contact: "Jaegeuk Kim" +Description: Shows the number of pending discard commands in the queue. + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 49d775ecf2bf..98a443b26974 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -118,6 +118,15 @@ static ssize_t sb_status_show(struct f2fs_attr *a, return sprintf(buf, "%lx\n", sbi->s_flag); } +static ssize_t pending_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->discard_cmd_cnt)); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -739,6 +748,7 @@ F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); +F2FS_GENERAL_RO_ATTR(pending_discard); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -806,6 +816,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), ATTR_LIST(discard_granularity), + ATTR_LIST(pending_discard), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), From 0c72ce1e913faf5e5366438d493bd01f56ddb778 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Wed, 10 Nov 2021 10:37:13 +0800 Subject: [PATCH 342/580] f2fs: compress: reduce one page array alloc and free when write compressed page Don't alloc new page pointers array to replace old, just use old, introduce valid_nr_cpages to indicate valid number of page pointers in array, try to reduce one page array alloc and free when write compress page. 
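Roughly, the reworked tail trimming looks like the sketch below (names follow the diff, error handling omitted): the original cpages[] array is kept and only the count of entries that still hold compressed data is recorded, instead of copying the survivors into a second, freshly allocated array.

    new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
    /* free only the tail pages the compressed data no longer needs */
    for (i = new_nr_cpages; i < cc->nr_cpages; i++) {
            f2fs_compress_free_page(cc->cpages[i]);
            cc->cpages[i] = NULL;
    }
    cc->valid_nr_cpages = new_nr_cpages;    /* writeback iterates this count */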
Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 32 ++++++++++---------------------- fs/f2fs/data.c | 1 + fs/f2fs/f2fs.h | 1 + 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 9d5c9fc90c36..84a710135216 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -154,6 +154,7 @@ void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; + cc->valid_nr_cpages = 0; if (!reuse) cc->cluster_idx = NULL_CLUSTER; } @@ -590,7 +591,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; - struct page **new_cpages; u32 chksum = 0; int i, ret; @@ -605,6 +605,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + cc->valid_nr_cpages = cc->nr_cpages; cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { @@ -655,13 +656,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); - /* Now we're going to cut unnecessary tail pages */ - new_cpages = page_array_alloc(cc->inode, new_nr_cpages); - if (!new_cpages) { - ret = -ENOMEM; - goto out_vunmap_cbuf; - } - /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, (new_nr_cpages * PAGE_SIZE) - @@ -671,10 +665,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc) vm_unmap_ram(cc->rbuf, cc->cluster_size); for (i = 0; i < cc->nr_cpages; i++) { - if (i < new_nr_cpages) { - new_cpages[i] = cc->cpages[i]; + if (i < new_nr_cpages) continue; - } f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -682,9 +674,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); - cc->cpages = new_cpages; - cc->nr_cpages = new_nr_cpages; + cc->valid_nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); @@ -1278,14 +1268,14 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - atomic_set(&cic->pending_pages, cc->nr_cpages); + atomic_set(&cic->pending_pages, cc->valid_nr_cpages); cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; cic->nr_rpages = cc->cluster_size; - for (i = 0; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], inode, cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; @@ -1330,7 +1320,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) fio.compr_blocks++; - if (i > cc->nr_cpages) { + if (i > cc->valid_nr_cpages) { if (__is_valid_data_blkaddr(blkaddr)) { f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); @@ -1355,8 +1345,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); - f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); - add_compr_block_stat(inode, cc->nr_cpages); + f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); + 
add_compr_block_stat(inode, cc->valid_nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) @@ -1394,9 +1384,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, else f2fs_unlock_op(sbi); out_free: - for (i = 0; i < cc->nr_cpages; i++) { - if (!cc->cpages[i]) - continue; + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6c532027d5eb..40e71a9f06d3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2979,6 +2979,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .rpages = NULL, .nr_rpages = 0, .cpages = NULL, + .valid_nr_cpages = 0, .rbuf = NULL, .cbuf = NULL, .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a4913a1dbc6..59e0bb743d50 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1492,6 +1492,7 @@ struct compress_ctx { unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ + unsigned int valid_nr_cpages; /* valid page number in cpages */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ From c04cf7be124ba4f9e93758f823890eca3c2b1ba0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:13 -0500 Subject: [PATCH 343/580] f2fs: rework write preallocations f2fs_write_begin() assumes that all blocks were preallocated by default unless FI_NO_PREALLOC is explicitly set. This invites data corruption, as there are cases in which not all blocks are preallocated. Commit 47501f87c61a ("f2fs: preallocate DIO blocks when forcing buffered_io") fixed one case, but there are others remaining. Fix up this logic by replacing this flag with FI_PREALLOCATED_ALL, which only gets set if all blocks for the current write were preallocated. Also clean up f2fs_preallocate_blocks(), move it to file.c, and make it handle some of the logic that was previously in write_iter() directly. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 ++------------------ fs/f2fs/f2fs.h | 3 +- fs/f2fs/file.c | 133 +++++++++++++++++++++++++++++++------------------ 3 files changed, 89 insertions(+), 102 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 40e71a9f06d3..2fb95a74bbff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1346,53 +1346,6 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return 0; } -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct f2fs_map_blocks map; - int flag; - int err = 0; - bool direct_io = iocb->ki_flags & IOCB_DIRECT; - - map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); - map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); - if (map.m_len > map.m_lblk) - map.m_len -= map.m_lblk; - else - map.m_len = 0; - - map.m_next_pgofs = NULL; - map.m_next_extent = NULL; - map.m_seg_type = NO_CHECK_TYPE; - map.m_may_create = true; - - if (direct_io) { - map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); - flag = f2fs_force_buffered_io(inode, iocb, from) ? 
- F2FS_GET_BLOCK_PRE_AIO : - F2FS_GET_BLOCK_PRE_DIO; - goto map_blocks; - } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - if (f2fs_has_inline_data(inode)) - return err; - - flag = F2FS_GET_BLOCK_PRE_AIO; - -map_blocks: - err = f2fs_map_blocks(inode, &map, 1, flag); - if (map.m_len > 0 && err == -ENOSPC) { - if (!direct_io) - set_inode_flag(inode, FI_NO_PREALLOC); - err = 0; - } - return err; -} - void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { @@ -3334,12 +3287,10 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, int flag; /* - * we already allocated all the blocks, so we don't need to get - * the block addresses when there is no need to fill the page. + * If a whole page is being written and we already preallocated all the + * blocks, then there is no need to get a block address now. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && - !is_inode_flag_set(inode, FI_NO_PREALLOC) && - !f2fs_verity_in_progress(inode)) + if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 59e0bb743d50..8d3d1d2a3b06 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -714,7 +714,7 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ - FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ @@ -3649,7 +3649,6 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 508d1a44bace..ab7153470e30 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4389,10 +4389,77 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; } +/* + * Preallocate blocks for a write request, if it is possible and helpful to do + * so. Returns a positive number if blocks may have been preallocated, 0 if no + * blocks were preallocated, or a negative errno value if something went + * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the + * requested blocks (not just some of them) have been allocated. + */ +static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(iter); + struct f2fs_map_blocks map = {}; + bool dio = (iocb->ki_flags & IOCB_DIRECT) && + !f2fs_force_buffered_io(inode, iocb, iter); + int flag; + int ret; + + /* If it will be an out-of-place direct write, don't bother. 
*/ + if (dio && f2fs_lfs_mode(sbi)) + return 0; + + /* No-wait I/O can't allocate blocks. */ + if (iocb->ki_flags & IOCB_NOWAIT) + return 0; + + /* If it will be a short write, don't bother. */ + if (iov_iter_fault_in_readable(iter, count)) + return 0; + + if (f2fs_has_inline_data(inode)) { + /* If the data will fit inline, don't bother. */ + if (pos + count <= MAX_INLINE_DATA(inode)) + return 0; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + /* Do not preallocate blocks that will be written partially in 4KB. */ + map.m_lblk = F2FS_BLK_ALIGN(pos); + map.m_len = F2FS_BYTES_TO_BLK(pos + count); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; + map.m_may_create = true; + if (dio) { + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + flag = F2FS_GET_BLOCK_PRE_DIO; + } else { + map.m_seg_type = NO_CHECK_TYPE; + flag = F2FS_GET_BLOCK_PRE_AIO; + } + + ret = f2fs_map_blocks(inode, &map, 1, flag); + /* -ENOSPC is only a fatal error if no blocks could be allocated. */ + if (ret < 0 && !(ret == -ENOSPC && map.m_len > 0)) + return ret; + if (ret == 0) + set_inode_flag(inode, FI_PREALLOCATED_ALL); + return map.m_len; +} + static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + loff_t target_size; + int preallocated; ssize_t ret; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { @@ -4416,84 +4483,54 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (unlikely(IS_IMMUTABLE(inode))) { ret = -EPERM; - goto unlock; + goto out_unlock; } if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EPERM; - goto unlock; + goto out_unlock; } ret = generic_write_checks(iocb, from); if (ret > 0) { - bool preallocated = false; - size_t target_size = 0; - int err; - - if (iov_iter_fault_in_readable(from, iov_iter_count(from))) - set_inode_flag(inode, FI_NO_PREALLOC); - - if ((iocb->ki_flags & IOCB_NOWAIT)) { + if (iocb->ki_flags & IOCB_NOWAIT) { if (!f2fs_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)) || f2fs_has_inline_data(inode) || f2fs_force_buffered_io(inode, iocb, from)) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); ret = -EAGAIN; - goto out; + goto out_unlock; } - goto write; } - - if (is_inode_flag_set(inode, FI_NO_PREALLOC)) - goto write; - if (iocb->ki_flags & IOCB_DIRECT) { - /* - * Convert inline data for Direct I/O before entering - * f2fs_direct_IO(). - */ - err = f2fs_convert_inline_inode(inode); - if (err) - goto out_err; - /* - * If force_buffere_io() is true, we have to allocate - * blocks all the time, since f2fs_direct_IO will fall - * back to buffered IO. - */ - if (!f2fs_force_buffered_io(inode, iocb, from) && - f2fs_lfs_mode(F2FS_I_SB(inode))) - goto write; + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out_unlock; } - preallocated = true; + /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); - - err = f2fs_preallocate_blocks(iocb, from); - if (err) { -out_err: - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = err; - goto out; + preallocated = f2fs_preallocate_blocks(iocb, from); + if (preallocated < 0) { + ret = preallocated; + goto out_unlock; } -write: - ret = __generic_file_write_iter(iocb, from); - clear_inode_flag(inode, FI_NO_PREALLOC); - /* if we couldn't write data, we should deallocate blocks. 
*/ - if (preallocated && i_size_read(inode) < target_size) { + ret = __generic_file_write_iter(iocb, from); + + /* Don't leave any preallocated blocks around past i_size. */ + if (preallocated > 0 && i_size_read(inode) < target_size) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); f2fs_truncate(inode); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } + clear_inode_flag(inode, FI_PREALLOCATED_ALL); if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } -unlock: +out_unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, iocb->ki_pos, From 8f388cdbb5fa76707ddb218fd94c9756da20c17e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:14 -0500 Subject: [PATCH 344/580] f2fs: reduce indentation in f2fs_file_write_iter() Replace 'if (ret > 0)' with 'if (ret <= 0) goto out_unlock;'. No change in behavior. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 70 ++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ab7153470e30..67c05e35595d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4492,44 +4492,48 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = generic_write_checks(iocb, from); - if (ret > 0) { - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!f2fs_overwrite_io(inode, iocb->ki_pos, - iov_iter_count(from)) || + if (ret <= 0) + goto out_unlock; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!f2fs_overwrite_io(inode, iocb->ki_pos, + iov_iter_count(from)) || f2fs_has_inline_data(inode) || f2fs_force_buffered_io(inode, iocb, from)) { - ret = -EAGAIN; - goto out_unlock; - } - } - if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out_unlock; - } - /* Possibly preallocate the blocks for the write. */ - target_size = iocb->ki_pos + iov_iter_count(from); - preallocated = f2fs_preallocate_blocks(iocb, from); - if (preallocated < 0) { - ret = preallocated; + ret = -EAGAIN; goto out_unlock; } - - ret = __generic_file_write_iter(iocb, from); - - /* Don't leave any preallocated blocks around past i_size. */ - if (preallocated > 0 && i_size_read(inode) < target_size) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); - f2fs_truncate(inode); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - } - clear_inode_flag(inode, FI_PREALLOCATED_ALL); - - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out_unlock; + } + /* Possibly preallocate the blocks for the write. */ + target_size = iocb->ki_pos + iov_iter_count(from); + preallocated = f2fs_preallocate_blocks(iocb, from); + if (preallocated < 0) { + ret = preallocated; + goto out_unlock; + } + + ret = __generic_file_write_iter(iocb, from); + + /* Don't leave any preallocated blocks around past i_size. 
*/ + if (preallocated > 0 && i_size_read(inode) < target_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } + + clear_inode_flag(inode, FI_PREALLOCATED_ALL); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); + out_unlock: inode_unlock(inode); out: From cefdc36110fdcbd2adab1a03570795a544fc8fab Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 Nov 2021 14:31:16 -0800 Subject: [PATCH 345/580] f2fs: do not expose unwritten blocks to user by DIO DIO preallocates physical blocks before writing data, but if an error occurrs or power-cut happens, we can see block contents from the disk. This patch tries to fix it by 1) turning to buffered writes for DIO into holes, 2) truncating unwritten blocks from error or power-cut. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 ++++- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/file.c | 27 ++++++++++++++++++--------- fs/f2fs/inode.c | 8 ++++++++ 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2fb95a74bbff..f49bc9d47b62 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1505,8 +1505,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, map->m_seg_type); - if (!err) + if (!err) { + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); + } } if (err) goto sync_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8d3d1d2a3b06..07b74f301a18 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -653,6 +653,7 @@ enum { #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 +#define FADVISE_TRUNC_BIT 0x80 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) @@ -680,6 +681,10 @@ enum { #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) +#define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) +#define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) +#define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) + #define DEF_DIR_LEVEL 0 enum { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 67c05e35595d..e2123c92096c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1705,6 +1705,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, map.m_seg_type = CURSEG_COLD_DATA_PINNED; err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + file_dont_truncate(inode); up_write(&sbi->pin_sem); @@ -4411,6 +4412,13 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter) /* If it will be an out-of-place direct write, don't bother. */ if (dio && f2fs_lfs_mode(sbi)) return 0; + /* + * Don't preallocate holes aligned to DIO_SKIP_HOLES which turns into + * buffered IO, if DIO meets any holes. + */ + if (dio && i_size_read(inode) && + (F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode)))) + return 0; /* No-wait I/O can't allocate blocks. */ if (iocb->ki_flags & IOCB_NOWAIT) @@ -4446,8 +4454,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter) } ret = f2fs_map_blocks(inode, &map, 1, flag); - /* -ENOSPC is only a fatal error if no blocks could be allocated. 
*/ - if (ret < 0 && !(ret == -ENOSPC && map.m_len > 0)) + /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */ + if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) return ret; if (ret == 0) set_inode_flag(inode, FI_PREALLOCATED_ALL); @@ -4513,20 +4521,21 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); preallocated = f2fs_preallocate_blocks(iocb, from); - if (preallocated < 0) { + if (preallocated < 0) ret = preallocated; - goto out_unlock; - } - - ret = __generic_file_write_iter(iocb, from); + else + ret = __generic_file_write_iter(iocb, from); /* Don't leave any preallocated blocks around past i_size. */ - if (preallocated > 0 && i_size_read(inode) < target_size) { + if (preallocated && i_size_read(inode) < target_size) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); - f2fs_truncate(inode); + if (!f2fs_truncate(inode)) + file_dont_truncate(inode); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } else { + file_dont_truncate(inode); } clear_inode_flag(inode, FI_PREALLOCATED_ALL); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b79b714431af..a6fa00e4a207 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -544,6 +544,14 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) goto bad_inode; } f2fs_set_inode_flags(inode); + + if (file_should_truncate(inode)) { + ret = f2fs_truncate(inode); + if (ret) + goto bad_inode; + file_dont_truncate(inode); + } + unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; From 751bf9e482c35c526bff79f7c77b7adb622629d2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Jul 2021 09:39:15 -0500 Subject: [PATCH 346/580] f2fs: fix the f2fs_file_write_iter tracepoint Pass in the original position and count rather than the position and count that were updated by the write. Also use the correct types for all arguments, in particular the file offset which was being truncated to 32 bits on 32-bit platforms. 
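Conceptually the fix just snapshots the arguments before __generic_file_write_iter() advances them and carries them in 64-bit-safe types; a sketch of the shape of the change:

    const loff_t orig_pos = iocb->ki_pos;           /* loff_t, not unsigned long */
    const size_t orig_count = iov_iter_count(from);
    ...
    /* report the caller's request, not the post-write file position */
    trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret);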
Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++-- include/trace/events/f2fs.h | 12 ++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e2123c92096c..c83dbcb95b0f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4466,6 +4466,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + const loff_t orig_pos = iocb->ki_pos; + const size_t orig_count = iov_iter_count(from); loff_t target_size; int preallocated; ssize_t ret; @@ -4546,8 +4548,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) out_unlock: inode_unlock(inode); out: - trace_f2fs_file_write_iter(inode, iocb->ki_pos, - iov_iter_count(from), ret); + trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 04f8b5fd567a..2f0568950009 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -549,17 +549,17 @@ TRACE_EVENT(f2fs_truncate_partial_nodes, TRACE_EVENT(f2fs_file_write_iter, - TP_PROTO(struct inode *inode, unsigned long offset, - unsigned long length, int ret), + TP_PROTO(struct inode *inode, loff_t offset, size_t length, + ssize_t ret), TP_ARGS(inode, offset, length, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(unsigned long, offset) - __field(unsigned long, length) - __field(int, ret) + __field(loff_t, offset) + __field(size_t, length) + __field(ssize_t, ret) ), TP_fast_assign( @@ -571,7 +571,7 @@ TRACE_EVENT(f2fs_file_write_iter, ), TP_printk("dev = (%d,%d), ino = %lu, " - "offset = %lu, length = %lu, written(err) = %d", + "offset = %lld, length = %zu, written(err) = %zd", show_dev_ino(__entry), __entry->offset, __entry->length, From c5d822dd106561d93a97626272753512a9db5419 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 26 Nov 2021 18:19:19 +0800 Subject: [PATCH 347/580] f2fs: fix remove page failed in invalidate compress pages Since compress inode not a regular file, generic_error_remove_page in f2fs_invalidate_compress_pages will always be failed, set compress inode as a regular file to fix it. Fixes: 6ce19aff0b8c ("f2fs: compress: add compress_inode to cache compressed blocks") Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a6fa00e4a207..1222cc687e5d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -516,6 +516,11 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) } else if (ino == F2FS_COMPRESS_INO(sbi)) { #ifdef CONFIG_F2FS_FS_COMPRESSION inode->i_mapping->a_ops = &f2fs_compress_aops; + /* + * generic_error_remove_page only truncates pages of regular + * inode + */ + inode->i_mode |= S_IFREG; #endif mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); From 4f0c7acab801e31af56c8126da7ebde77217fafd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 4 Dec 2021 09:55:35 -0800 Subject: [PATCH 348/580] f2fs: avoid duplicate call of mark_inode_dirty Let's check the condition first before set|clear bit. 
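The guard itself is a two-line early return; set_file() ends up as below, and clear_file() mirrors it with the inverted test:

    if (is_file(inode, type))       /* bit already set, nothing would change */
            return;                 /* so skip the sync dirty marking */
    F2FS_I(inode)->i_advise |= type;
    f2fs_mark_inode_dirty_sync(inode, true);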
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 07b74f301a18..4a9b554bd4aa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3127,12 +3127,16 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { + if (is_file(inode, type)) + return; F2FS_I(inode)->i_advise |= type; f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { + if (!is_file(inode, type)) + return; F2FS_I(inode)->i_advise &= ~type; f2fs_mark_inode_dirty_sync(inode, true); } From 738d483a6ff4e66a051ac6dadc0a386ccc7008ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Dec 2021 22:44:19 +0800 Subject: [PATCH 349/580] f2fs: fix to do sanity check on inode type during garbage collection As report by Wenqing Liu in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215231 - Overview kernel NULL pointer dereference triggered in folio_mark_dirty() when mount and operate on a crafted f2fs image - Reproduce tested on kernel 5.16-rc3, 5.15.X under root 1. mkdir mnt 2. mount -t f2fs tmp1.img mnt 3. touch tmp 4. cp tmp mnt F2FS-fs (loop0): sanity_check_inode: inode (ino=49) extent info [5942, 4294180864, 4] is incorrect, run fsck to fix F2FS-fs (loop0): f2fs_check_nid_range: out-of-range nid=31340049, run fsck to fix. BUG: kernel NULL pointer dereference, address: 0000000000000000 folio_mark_dirty+0x33/0x50 move_data_page+0x2dd/0x460 [f2fs] do_garbage_collect+0xc18/0x16a0 [f2fs] f2fs_gc+0x1d3/0xd90 [f2fs] f2fs_balance_fs+0x13a/0x570 [f2fs] f2fs_create+0x285/0x840 [f2fs] path_openat+0xe6d/0x1040 do_filp_open+0xc5/0x140 do_sys_openat2+0x23a/0x310 do_sys_open+0x57/0x80 The root cause is for special file: e.g. character, block, fifo or socket file, f2fs doesn't assign address space operations pointer array for mapping->a_ops field, so, in a fuzzed image, SSA table indicates a data block belong to special file, when f2fs tries to migrate that block, it causes NULL pointer access once move_data_page() calls a_ops->set_dirty_page(). Cc: stable@vger.kernel.org Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a946ce0ead34..e0bdc4361a9b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1457,7 +1457,8 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) + if (IS_ERR(inode) || is_bad_inode(inode) || + special_file(inode->i_mode)) continue; if (!down_write_trylock( From 72a255389e1affe9ad67ab01e6b4b859a250bc62 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Dec 2021 22:44:20 +0800 Subject: [PATCH 350/580] f2fs: fix to avoid panic in is_alive() if metadata is inconsistent As report by Wenqing Liu in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215231 If we enable CONFIG_F2FS_CHECK_FS config, and with fuzzed image attached in above link, we will encounter panic when executing below script: 1. mkdir mnt 2. mount -t f2fs tmp1.img mnt 3. touch tmp F2FS-fs (loop11): mismatched blkaddr 5765 (source_blkaddr 1) in seg 3 kernel BUG at fs/f2fs/gc.c:1042! 
do_garbage_collect+0x90f/0xa80 [f2fs] f2fs_gc+0x294/0x12a0 [f2fs] f2fs_balance_fs+0x2c5/0x7d0 [f2fs] f2fs_create+0x239/0xd90 [f2fs] lookup_open+0x45e/0xa90 open_last_lookups+0x203/0x670 path_openat+0xae/0x490 do_filp_open+0xbc/0x160 do_sys_openat2+0x2f1/0x500 do_sys_open+0x5e/0xa0 __x64_sys_openat+0x28/0x40 Previously, f2fs tries to catch data inconcistency exception in between SSA and SIT table during GC, however once the exception is caught, it will call f2fs_bug_on to hang kernel, it's not needed, instead, let's set SBI_NEED_FSCK flag and skip migrating current block. Fixes: bbf9f7d90f21 ("f2fs: Fix indefinite loop in f2fs_gc()") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e0bdc4361a9b..3e64b234df21 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1039,7 +1039,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", blkaddr, source_blkaddr, segno); - f2fs_bug_on(sbi, 1); + set_sbi_flag(sbi, SBI_NEED_FSCK); } } #endif From 26d785d263d3fa00dda44f2363a7d9bdbc8e1f32 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Dec 2021 22:44:21 +0800 Subject: [PATCH 351/580] f2fs: fix to do sanity check in is_alive() In fuzzed image, SSA table may indicate that a data block belongs to invalid node, which node ID is out-of-range (0, 1, 2 or max_nid), in order to avoid migrating inconsistent data in such corrupted image, let's do sanity check anyway before data block migration. Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3e64b234df21..b538cbcba351 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1026,6 +1026,9 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } + if (f2fs_check_nid_range(sbi, dni->ino)) + return false; + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); From 8cc3d85aad23d87580e0ad9fe4ec0589d03e168b Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 8 Dec 2021 16:41:51 -0800 Subject: [PATCH 352/580] f2fs: add gc_urgent_high_remaining sysfs node Added a new sysfs node called gc_urgent_high_remaining. The user can set the trial count limit for GC urgent high mode with this value. If GC thread gets to the limit, the mode will turn back to GC normal mode. By default, the value is zero, which means there is no limit like before. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 7 +++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/gc.c | 12 ++++++++++++ fs/f2fs/super.c | 1 + fs/f2fs/sysfs.c | 11 +++++++++++ 5 files changed, 34 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index ad1b2c65333c..d033fa7de9f2 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -507,3 +507,10 @@ Description: With "mode=fragment:block" mount options, we can scatter block allo f2fs will allocate 1.. blocks in a chunk and make a hole in the length of 1.. by turns. This value can be set between 1..512 and the default value is 4. 
+ +What: /sys/fs/f2fs//gc_urgent_high_remaining +Date: December 2021 +Contact: "Daeho Jeong" +Description: You can set the trial count limit for GC urgent high mode with this value. + If GC thread gets to the limit, the mode will turn back to GC normal mode. + By default, the value is zero, which means there is no limit like before. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4a9b554bd4aa..3560885cba0a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1689,6 +1689,9 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ + spinlock_t gc_urgent_high_lock; + bool gc_urgent_high_limited; /* indicates having limited trial count */ + unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b538cbcba351..7fbe46477a5a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -92,6 +92,18 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. */ if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_limited) { + if (!sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_limited = false; + spin_unlock(&sbi->gc_urgent_high_lock); + sbi->gc_mode = GC_NORMAL; + continue; + } + sbi->gc_urgent_high_remaining--; + } + spin_unlock(&sbi->gc_urgent_high_lock); + wait_ms = gc_th->urgent_sleep_time; down_write(&sbi->gc_lock); goto do_gc; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ae3ec331a657..25408b3671e0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3516,6 +3516,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->migration_granularity = sbi->segs_per_sec; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; + spin_lock_init(&sbi->gc_urgent_high_lock); sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 98a443b26974..adf1cf503bbc 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -490,6 +490,15 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { + spin_lock(&sbi->gc_urgent_high_lock); + sbi->gc_urgent_high_limited = t == 0 ? 
false : true; + sbi->gc_urgent_high_remaining = t; + spin_unlock(&sbi->gc_urgent_high_lock); + + return count; + } + #ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; @@ -737,6 +746,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -849,6 +859,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(gc_urgent_high_remaining), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), From 2bdad803277fdb496098688b8e732a96c4027596 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Dec 2021 10:25:43 -0800 Subject: [PATCH 353/580] f2fs: avoid EINVAL by SBI_NEED_FSCK when pinning a file Android OTA failed due to SBI_NEED_FSCK flag when pinning the file. Let's avoid it since we can do in-place-updates. Signed-off-by: Jaegeuk Kim Change-Id: I3fd33c984417c10b38e23de6cec017b03d588945 --- fs/f2fs/data.c | 7 +++++-- fs/f2fs/file.c | 10 +++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f49bc9d47b62..1312c8be4cf3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2565,6 +2565,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + /* The below cases were checked when setting it. 
*/ + if (f2fs_is_pinned_file(inode)) + return false; + if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) @@ -2573,8 +2578,6 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c83dbcb95b0f..bb9fe7de0800 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3293,17 +3293,17 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); - if (f2fs_should_update_outplace(inode, NULL)) { - ret = -EINVAL; - goto out; - } - if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); f2fs_i_gc_failures_write(inode, 0); goto done; } + if (f2fs_should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + if (f2fs_pin_file_control(inode, false)) { ret = -EAGAIN; goto out; From 10e462b45c43024c5528b0e2d404e5f124b9947d Mon Sep 17 00:00:00 2001 From: Hyeong-Jun Kim Date: Fri, 10 Dec 2021 13:30:12 +0900 Subject: [PATCH 354/580] f2fs: compress: fix potential deadlock of compress file There is a potential deadlock between writeback process and a process performing write_begin() or write_cache_pages() while trying to write same compress file, but not compressable, as below: [Process A] - doing checkpoint [Process B] [Process C] f2fs_write_cache_pages() - lock_page() [all pages in cluster, 0-31] - f2fs_write_multi_pages() - f2fs_write_raw_pages() - f2fs_write_single_data_page() - f2fs_do_write_data_page() - return -EAGAIN [f2fs_trylock_op() failed] - unlock_page(page) [e.g., page 0] - generic_perform_write() - f2fs_write_begin() - f2fs_prepare_compress_overwrite() - prepare_compress_overwrite() - lock_page() [e.g., page 0] - lock_page() [e.g., page 1] - lock_page(page) [e.g., page 0] Since there is no compress process, it is no longer necessary to hold locks on every pages in cluster within f2fs_write_raw_pages(). This patch changes f2fs_write_raw_pages() to release all locks first and then perform write same as the non-compress file in f2fs_write_cache_pages(). 
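The reworked loop follows the same take-back pattern as generic write_cache_pages(): all page locks are dropped up front, then each page is re-acquired and re-validated on its own. A condensed sketch, with page standing for cc->rpages[i]:

    lock_page(page);
    if (page->mapping != mapping ||         /* truncated or released meanwhile */
        !PageDirty(page) ||
        !clear_page_dirty_for_io(page)) {
            unlock_page(page);
            continue;                       /* someone else handled it */
    }
    /* sole lock holder now; safe to write the page out */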
Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Hyeong-Jun Kim Signed-off-by: Sungjong Seo Signed-off-by: Youngjin Gil Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 50 ++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 84a710135216..93663529c55f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1426,25 +1426,38 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, enum iostat_type io_type) { struct address_space *mapping = cc->inode->i_mapping; - int _submitted, compr_blocks, ret; - int i = -1, err = 0; + int _submitted, compr_blocks, ret, i; compr_blocks = f2fs_compressed_blocks(cc); - if (compr_blocks < 0) { - err = compr_blocks; - goto out_err; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + + redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); } + if (compr_blocks < 0) + return compr_blocks; + for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; retry_write: + lock_page(cc->rpages[i]); + if (cc->rpages[i]->mapping != mapping) { +continue_unlock: unlock_page(cc->rpages[i]); continue; } - BUG_ON(!PageLocked(cc->rpages[i])); + if (!PageDirty(cc->rpages[i])) + goto continue_unlock; + + if (!clear_page_dirty_for_io(cc->rpages[i])) + goto continue_unlock; ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, @@ -1459,26 +1472,15 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, * avoid deadlock caused by cluster update race * from foreground operation. */ - if (IS_NOQUOTA(cc->inode)) { - err = 0; - goto out_err; - } + if (IS_NOQUOTA(cc->inode)) + return 0; ret = 0; cond_resched(); congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - lock_page(cc->rpages[i]); - - if (!PageDirty(cc->rpages[i])) { - unlock_page(cc->rpages[i]); - continue; - } - - clear_page_dirty_for_io(cc->rpages[i]); goto retry_write; } - err = ret; - goto out_err; + return ret; } *submitted += _submitted; @@ -1487,14 +1489,6 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, f2fs_balance_fs(F2FS_M_SB(mapping), true); return 0; -out_err: - for (++i; i < cc->cluster_size; i++) { - if (!cc->rpages[i]) - continue; - redirty_page_for_writepage(wbc, cc->rpages[i]); - unlock_page(cc->rpages[i]); - } - return err; } int f2fs_write_multi_pages(struct compress_ctx *cc, From 3b95f4c2491881bf82b562d3b901581f8a707e8b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Dec 2021 13:28:40 -0800 Subject: [PATCH 355/580] f2fs: avoid down_write on nat_tree_lock during checkpoint Let's cache nat entry if there's no lock contention only. 
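The change is a single opportunistic bail-out at the top of cache_nat_entry(); caching a NAT entry is only an optimization, so it can simply be skipped while a checkpoint holds cp_global_sem:

    /* best-effort cache: skip it rather than queue on nat_tree_lock
     * behind a running checkpoint
     */
    if (rwsem_is_locked(&sbi->cp_global_sem))
            return;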
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dae0b055fa91..0c4cc2e09bc9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -430,6 +430,10 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; + /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ + if (rwsem_is_locked(&sbi->cp_global_sem)) + return; + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; From 17be53c6781cb5cfb0f84234b0967ffd7b6c3d62 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Dec 2021 14:16:32 -0800 Subject: [PATCH 356/580] f2fs: do not bother checkpoint by f2fs_get_node_info This patch tries to mitigate lock contention between f2fs_write_checkpoint and f2fs_get_node_info along with nat_tree_lock. The idea is, if checkpoint is currently running, other threads that try to grab nat_tree_lock would be better to wait for checkpoint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/compress.c | 2 +- fs/f2fs/data.c | 8 ++++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 6 +++--- fs/f2fs/inline.c | 4 ++-- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 19 ++++++++++--------- fs/f2fs/recovery.c | 2 +- fs/f2fs/segment.c | 2 +- 11 files changed, 26 insertions(+), 25 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 48beee4ceb3b..acccfe47fb7c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -664,7 +664,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - err = f2fs_get_node_info(sbi, ino, &ni); + err = f2fs_get_node_info(sbi, ino, &ni, false); if (err) goto err_out; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 93663529c55f..f0a7aba740d6 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1256,7 +1256,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto out_put_dnode; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1312c8be4cf3..c17b8a82fef6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1316,7 +1316,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1767,7 +1767,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { f2fs_put_page(page, 1); return err; @@ -1799,7 +1799,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, xnid, &ni); + err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { f2fs_put_page(page, 1); return err; @@ -2689,7 +2689,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) fio->need_lock = LOCK_REQ; } - err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); if (err) goto out_writepage; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3560885cba0a..8b8a79615ff0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3455,7 +3455,7 @@ int 
f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni); + struct node_info *ni, bool checkpoint_context); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bb9fe7de0800..feda1d60469b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1251,7 +1251,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (ret) return ret; - ret = f2fs_get_node_info(sbi, dn.nid, &ni); + ret = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (ret) { f2fs_put_dnode(&dn); return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7fbe46477a5a..a6accec60d04 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -959,7 +959,7 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, continue; } - if (f2fs_get_node_info(sbi, nid, &ni)) { + if (f2fs_get_node_info(sbi, nid, &ni, false)) { f2fs_put_page(node_page, 1); continue; } @@ -1027,7 +1027,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(node_page)) return false; - if (f2fs_get_node_info(sbi, nid, dni)) { + if (f2fs_get_node_info(sbi, nid, dni, false)) { f2fs_put_page(node_page, 1); return false; } @@ -1221,7 +1221,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto put_out; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 9ec16011ce05..9f7a27e149bf 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -130,7 +130,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false); if (err) { f2fs_truncate_data_blocks_range(dn, 1); f2fs_put_dnode(dn); @@ -785,7 +785,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false); if (err) goto out; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1222cc687e5d..32c7b525b641 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -881,7 +881,7 @@ void f2fs_handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. 
*/ - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0c4cc2e09bc9..62c8c246890f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -543,7 +543,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni) + struct node_info *ni, bool checkpoint_context) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -576,9 +576,10 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, * nat_tree_lock. Therefore, we should retry, if we failed to grab here * while not bothering checkpoint. */ - if (!rwsem_is_locked(&sbi->cp_global_sem)) { + if (!rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { down_read(&curseg->journal_rwsem); - } else if (!down_read_trylock(&curseg->journal_rwsem)) { + } else if (rwsem_is_contended(&nm_i->nat_tree_lock) || + !down_read_trylock(&curseg->journal_rwsem)) { up_read(&nm_i->nat_tree_lock); goto retry; } @@ -889,7 +890,7 @@ static int truncate_node(struct dnode_of_data *dn) int err; pgoff_t index; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1288,7 +1289,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false); if (err) { dec_valid_node_count(sbi, dn->inode, !ofs); goto fail; @@ -1350,7 +1351,7 @@ static int read_node_page(struct page *page, int op_flags) return LOCKED_PAGE; } - err = f2fs_get_node_info(sbi, page->index, &ni); + err = f2fs_get_node_info(sbi, page->index, &ni, false); if (err) return err; @@ -1604,7 +1605,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); - if (f2fs_get_node_info(sbi, nid, &ni)) + if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; if (wbc->for_reclaim) { @@ -2705,7 +2706,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - err = f2fs_get_node_info(sbi, prev_xnid, &ni); + err = f2fs_get_node_info(sbi, prev_xnid, &ni, false); if (err) return err; @@ -2745,7 +2746,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) struct page *ipage; int err; - err = f2fs_get_node_info(sbi, ino, &old_ni); + err = f2fs_get_node_info(sbi, ino, &old_ni, false); if (err) return err; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 656770e51a8b..f6d12872fde3 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -595,7 +595,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) goto err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 56df37331870..7564fa290647 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -254,7 +254,7 @@ static int __revoke_inmem_pages(struct inode *inode, goto next; } - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) { 
f2fs_put_dnode(&dn); return err; From 164b9451b4dfbe79b697477f11a0322d42523a18 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 12 Dec 2021 17:16:30 +0800 Subject: [PATCH 357/580] f2fs: fix to do sanity check on last xattr entry in __f2fs_setxattr() As Wenqing Liu reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215235 - Overview page fault in f2fs_setxattr() when mounting and operating on a corrupted image - Reproduce tested on kernel 5.16-rc3, 5.15.X under root 1. unzip tmp7.zip 2. ./single.sh f2fs 7 Sometimes need to run the script several times - Kernel dump loop0: detected capacity change from 0 to 131072 F2FS-fs (loop0): Found nat_bits in checkpoint F2FS-fs (loop0): Mounted with checkpoint version = 7548c2ee BUG: unable to handle page fault for address: ffffe47bc7123f48 RIP: 0010:kfree+0x66/0x320 Call Trace: __f2fs_setxattr+0x2aa/0xc00 [f2fs] f2fs_setxattr+0xfa/0x480 [f2fs] __f2fs_set_acl+0x19b/0x330 [f2fs] __vfs_removexattr+0x52/0x70 __vfs_removexattr_locked+0xb1/0x140 vfs_removexattr+0x56/0x100 removexattr+0x57/0x80 path_removexattr+0xa3/0xc0 __x64_sys_removexattr+0x17/0x20 do_syscall_64+0x37/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is that in __f2fs_setxattr() we missed doing a sanity check on the last xattr entry, resulting in an out-of-bounds memory access while updating inconsistent xattr data of the target inode. After the fix, it can detect such xattr inconsistency as below: F2FS-fs (loop11): inode (7) has invalid last xattr entry, entry_size: 60676 F2FS-fs (loop11): inode (8) has corrupted xattr F2FS-fs (loop11): inode (8) has corrupted xattr F2FS-fs (loop11): inode (8) has invalid last xattr entry, entry_size: 47736 Cc: stable@vger.kernel.org Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 052b51c891b5..895162e8aff4 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -682,8 +682,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, } last = here; - while (!IS_XATTR_LAST_ENTRY(last)) + while (!IS_XATTR_LAST_ENTRY(last)) { + if ((void *)(last) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + inode->i_ino, ENTRY_SIZE(last)); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + goto exit; + } last = XATTR_NEXT_ENTRY(last); + } newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); From e9310a571541448424a59f22ff964dd42081d7b5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 12 Dec 2021 17:16:56 +0800 Subject: [PATCH 358/580] f2fs: clean up __find_inline_xattr() with __find_xattr() Just cleanup, no logic change.
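Both of these xattr changes revolve around the same pattern: walking a list of variable-length entries against an explicit end-of-buffer bound, and reporting where the walk stopped instead of running past the bound. As a rough, self-contained user-space illustration of that pattern (not the kernel code itself; the struct layout and the helper names xent, next_entry and find_entry are invented for this sketch), the idea looks like this:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical on-disk style entry: name_len == 0 terminates the list. */
struct xent {
	uint8_t  name_len;	/* 0 => terminating entry */
	uint16_t value_size;
	char     name[];	/* name_len bytes of name, then value_size bytes of value */
};

static size_t entry_size(const struct xent *e)
{
	return sizeof(*e) + e->name_len + e->value_size;
}

static const struct xent *next_entry(const struct xent *e)
{
	return (const struct xent *)((const char *)e + entry_size(e));
}

/*
 * Walk entries in [base, end).  The header must fit before any field is
 * read, and the whole entry must fit before advancing; on a corrupted
 * buffer the offending entry is reported through *bad instead of walking
 * past 'end' (the out-of-bounds access the sanity-check patch closes).
 */
static const struct xent *find_entry(const void *base, const void *end,
				     const char *name, size_t len,
				     const struct xent **bad)
{
	const struct xent *e = base;

	for (;;) {
		if ((const char *)e + sizeof(*e) > (const char *)end)
			break;			/* header crosses the bound */
		if (e->name_len == 0)
			return NULL;		/* clean terminator: not found */
		if ((const char *)next_entry(e) > (const char *)end)
			break;			/* entry crosses the bound */
		if (e->name_len == len && !memcmp(e->name, name, len))
			return e;		/* match */
		e = next_entry(e);
	}
	if (bad)
		*bad = e;			/* corrupted list */
	return NULL;
}

The last_addr out-parameter that the cleanup below threads through __find_xattr() plays the same role as *bad here, which is what lets one generic walker serve both the inline and the external xattr lookups.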
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 895162e8aff4..d125f8c61bf5 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -224,15 +224,18 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int index) } static struct f2fs_xattr_entry *__find_xattr(void *base_addr, - void *last_base_addr, int index, - size_t len, const char *name) + void *last_base_addr, void **last_addr, + int index, size_t len, const char *name) { struct f2fs_xattr_entry *entry; list_for_each_xattr(entry, base_addr) { if ((void *)(entry) + sizeof(__u32) > last_base_addr || - (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + if (last_addr) + *last_addr = entry; return NULL; + } if (entry->e_name_index != index) continue; @@ -252,19 +255,9 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, unsigned int inline_size = inline_xattr_size(inode); void *max_addr = base_addr + inline_size; - list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > max_addr || - (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { - *last_addr = entry; - return NULL; - } - if (entry->e_name_index != index) - continue; - if (entry->e_name_len != len) - continue; - if (!memcmp(entry->e_name, name, len)) - break; - } + entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name); + if (!entry) + return NULL; /* inline xattr header or entry across max inline xattr size */ if (IS_XATTR_LAST_ENTRY(entry) && @@ -366,7 +359,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, else cur_addr = txattr_addr; - *xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name); + *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); @@ -657,7 +650,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, last_base_addr = (void *)base_addr + XATTR_SIZE(inode); /* find entry with wanted name. */ - here = __find_xattr(base_addr, last_base_addr, index, len, name); + here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); From c2c8c7cb9abf17740e19971bbb6facd6ce538a66 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 12 Dec 2021 17:17:51 +0800 Subject: [PATCH 359/580] f2fs: support fault injection to f2fs_trylock_op() f2fs: support fault injection for f2fs_trylock_op() This patch supports to inject fault into f2fs_trylock_op(). Usage: a) echo 65536 > /sys/fs/f2fs//inject_type or b) mount -o fault_type=65536 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/super.c | 1 + 3 files changed, 7 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 6954c04753ad..5fda320354a6 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -198,6 +198,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_WRITE_IO 0x000004000 FAULT_SLAB_ALLOC 0x000008000 FAULT_DQUOT_INIT 0x000010000 + FAULT_LOCK_OP 0x000020000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". 
In "lfs" mode, there should be no random diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8b8a79615ff0..217d8533a70b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -55,6 +55,7 @@ enum { FAULT_WRITE_IO, FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, + FAULT_LOCK_OP, FAULT_MAX, }; @@ -2107,6 +2108,10 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) { + if (time_to_inject(sbi, FAULT_LOCK_OP)) { + f2fs_show_injection_info(sbi, FAULT_LOCK_OP); + return 0; + } return down_read_trylock(&sbi->cp_rwsem); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 25408b3671e0..c0aee1b29fd0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -58,6 +58,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_WRITE_IO] = "write IO error", [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, From b70b9ac76203f222477d5b725238908a349e47db Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 12 Dec 2021 20:28:12 +0800 Subject: [PATCH 360/580] f2fs: fix to check available space of CP area correctly in update_ckpt_flags() Otherwise, nat_bit area may be persisted across boundary of CP area during nat_bit rebuilding. Fixes: 94c821fb286b ("f2fs: rebuild nat_bits during umount") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index acccfe47fb7c..5de8a1c8f2a4 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1302,8 +1302,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long flags; if (cpc->reason & CP_UMOUNT) { - if (le32_to_cpu(ckpt->cp_pack_total_block_count) > - sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) { + if (le32_to_cpu(ckpt->cp_pack_total_block_count) + + NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) { clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); f2fs_notice(sbi, "Disable nat_bits due to no space"); } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && From 0e2bdb95587b2bc1c01c35999d7449f1c157df2a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 11 Dec 2021 21:27:36 +0800 Subject: [PATCH 361/580] f2fs: fix to reserve space for IO align feature https://bugzilla.kernel.org/show_bug.cgi?id=204137 With below script, we will hit panic during new segment allocation: DISK=bingo.img MOUNT_DIR=/mnt/f2fs dd if=/dev/zero of=$DISK bs=1M count=105 mkfs.f2fe -a 1 -o 19 -t 1 -z 1 -f -q $DISK mount -t f2fs $DISK $MOUNT_DIR -o "noinline_dentry,flush_merge,noextent_cache,mode=lfs,io_bits=7,fsync_mode=strict" for (( i = 0; i < 4096; i++ )); do name=`head /dev/urandom | tr -dc A-Za-z0-9 | head -c 10` mkdir $MOUNT_DIR/$name done umount $MOUNT_DIR rm $DISK --- Core dump --- Call Trace: allocate_segment_by_default+0x9d/0x100 [f2fs] f2fs_allocate_data_block+0x3c0/0x5c0 [f2fs] do_write_page+0x62/0x110 [f2fs] f2fs_outplace_write_data+0x43/0xc0 [f2fs] f2fs_do_write_data_page+0x386/0x560 [f2fs] __write_data_page+0x706/0x850 [f2fs] f2fs_write_cache_pages+0x267/0x6a0 [f2fs] f2fs_write_data_pages+0x19c/0x2e0 [f2fs] do_writepages+0x1c/0x70 __filemap_fdatawrite_range+0xaa/0xe0 filemap_fdatawrite+0x1f/0x30 f2fs_sync_dirty_inodes+0x74/0x1f0 [f2fs] block_operations+0xdc/0x350 [f2fs] f2fs_write_checkpoint+0x104/0x1150 [f2fs] f2fs_sync_fs+0xa2/0x120 [f2fs] f2fs_balance_fs_bg+0x33c/0x390 [f2fs] f2fs_write_node_pages+0x4c/0x1f0 [f2fs] 
do_writepages+0x1c/0x70 __writeback_single_inode+0x45/0x320 writeback_sb_inodes+0x273/0x5c0 wb_writeback+0xff/0x2e0 wb_workfn+0xa1/0x370 process_one_work+0x138/0x350 worker_thread+0x4d/0x3d0 kthread+0x109/0x140 ret_from_fork+0x25/0x30 The root cause here is that, with the IO alignment feature enabled, in the worst case we need F2FS_IO_SIZE() free blocks for a single 4k write, because the IO alignment feature fills in dummy pages to keep the IO aligned. So we can easily run out of free segments during a non-inline directory's data writeback, even in the process of foreground GC. To fix this issue, reserve additional free space for the IO alignment feature to handle the worst-case free space usage during FGGC. Fixes: 0a595ebaaa6b ("f2fs: support IO alignment for DATA and NODE writes") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 +++++++++++ fs/f2fs/segment.h | 3 ++- fs/f2fs/super.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/sysfs.c | 4 +++- 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 217d8533a70b..81600939329f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1025,6 +1025,7 @@ struct f2fs_sm_info { unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ + unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ @@ -2217,6 +2218,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + avail_user_block_count -= sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (avail_user_block_count > sbi->unusable_block_count) avail_user_block_count -= sbi->unusable_block_count; @@ -2463,6 +2469,11 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + valid_block_count += sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + user_block_count = sbi->user_block_count; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) user_block_count -= sbi->unusable_block_count; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 355b797ab171..3c01315931f6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -538,7 +538,8 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { - return SM_I(sbi)->reserved_segments; + return SM_I(sbi)->reserved_segments + + SM_I(sbi)->additional_reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c0aee1b29fd0..94c5166b5a31 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -326,6 +326,46 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).s_resgid)); } +static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi) +{ + unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec; + unsigned int avg_vblocks; + unsigned int wanted_reserved_segments; + block_t avail_user_block_count; + + if
(!F2FS_IO_ALIGNED(sbi)) + return 0; + + /* average valid block count in section in worst case */ + avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi); + + /* + * we need enough free space when migrating one section in worst case + */ + wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) * + reserved_segments(sbi); + wanted_reserved_segments -= reserved_segments(sbi); + + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks - + F2FS_OPTION(sbi).root_reserved_blocks; + + if (wanted_reserved_segments * sbi->blocks_per_seg > + avail_user_block_count) { + f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u", + wanted_reserved_segments, + avail_user_block_count >> sbi->log_blocks_per_seg); + return -ENOSPC; + } + + SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments; + + f2fs_info(sbi, "IO align feature needs additional reserved segment: %u", + wanted_reserved_segments); + + return 0; +} + static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) { if (!F2FS_OPTION(sbi).unusable_cap_perc) @@ -4143,6 +4183,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_nm; } + err = adjust_reserved_segment(sbi); + if (err) + goto free_nm; + /* For write statistics */ sbi->sectors_written_start = f2fs_get_sectors_written(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index adf1cf503bbc..3f11a27b1436 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -427,7 +427,9 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - F2FS_OPTION(sbi).root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks - + sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } From 4e784cb4442fb224383d5a831230fbb330322720 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 16 Dec 2021 17:13:56 +0800 Subject: [PATCH 362/580] f2fs: don't drop compressed page cache in .{invalidate,release}page For a compressed inode, in .{invalidate,release}page we call f2fs_invalidate_compress_pages() to drop all compressed page cache of the current inode. But we don't need to drop the compressed page cache synchronously in .invalidatepage, because all truncation paths of compressed physical blocks are already covered by f2fs_invalidate_compress_page(). We also don't need to drop the compressed page cache synchronously in .releasepage, because, under memory pressure, we can count on page cache reclaim of sbi->compress_inode.
BTW, this patch may fix the issue reported below: https://lore.kernel.org/linux-f2fs-devel/20211202092812.197647-1-changfengnan@vivo.com/T/#u Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c17b8a82fef6..9df0cbda8369 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3722,12 +3722,9 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_page_private_gcing(page); - if (test_opt(sbi, COMPRESS_CACHE)) { - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) - clear_page_private_data(page); - } + if (test_opt(sbi, COMPRESS_CACHE) && + inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); if (page_private_atomic(page)) return f2fs_drop_inmem_page(inode, page); @@ -3747,12 +3744,9 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct inode *inode = page->mapping->host; - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + if (inode->i_ino == F2FS_COMPRESS_INO(F2FS_I_SB(inode))) clear_page_private_data(page); } From 2a3b4f5bf558921946b11c117baa849ea7a1ed04 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 15 Dec 2021 10:38:58 +0800 Subject: [PATCH 363/580] f2fs: Simplify bool conversion Fix the following coccicheck warning: ./fs/f2fs/sysfs.c:491:41-46: WARNING: conversion to bool not needed here Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3f11a27b1436..3491fa294ce2 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -494,7 +494,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_limited = t == 0 ? false : true; + sbi->gc_urgent_high_limited = t != 0; sbi->gc_urgent_high_remaining = t; spin_unlock(&sbi->gc_urgent_high_lock); From 4773a0b30a365549cdaaeb4f9ff0803254a37bc5 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Wed, 29 Dec 2021 17:47:00 +0800 Subject: [PATCH 364/580] f2fs: remove redundant invalidate compress pages Compress pages are also invalidated in the truncate-blocks path, so remove the redundant invalidation of compress pages in f2fs_evict_inode.
Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 32c7b525b641..edfca0836f26 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -751,7 +751,8 @@ void f2fs_evict_inode(struct inode *inode) trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); - if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + if ((inode->i_nlink || is_bad_inode(inode)) && + test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) f2fs_invalidate_compress_pages(sbi, inode->i_ino); if (inode->i_ino == F2FS_NODE_INO(sbi) || From 80a2667babab8bd3f8aa4b54a4188788f03ae591 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Jan 2022 20:08:45 -0800 Subject: [PATCH 365/580] f2fs: do not allow partial truncation on pinned file If the pinned file has a hole made by partial truncation, an application that holds the block map will be broken. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index feda1d60469b..82c01cc8624d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1767,7 +1767,11 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; - if (f2fs_compressed_file(inode) && + /* + * Pinned file should not support partial truncation since the block + * can be used by applications. + */ + if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; From 896c22c16f28c6baab36753fb420685e540a73a1 Mon Sep 17 00:00:00 2001 From: Tim Murray Date: Fri, 7 Jan 2022 12:48:44 -0800 Subject: [PATCH 366/580] f2fs: move f2fs to use reader-unfair rwsems f2fs rw_semaphores work better if writers can starve readers, especially for the checkpoint thread, because writers are strictly more important than reader threads. This prevents significant priority inversion between low-priority readers that blocked while trying to acquire the read lock and a second acquisition of the write lock that might be blocking high-priority work.
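The full definition is in the f2fs.h hunk of this patch below; condensed to its core, the wrapper pairs an ordinary rw_semaphore with a wait queue for readers, so a reader only proceeds once an opportunistic trylock succeeds and is never queued inside the rwsem ahead of a writer, while the writer wakes all parked readers when it drops the lock. A condensed sketch using the same kernel primitives the patch uses:

#include <linux/rwsem.h>
#include <linux/wait.h>

struct f2fs_rwsem {
	struct rw_semaphore internal_rwsem;
	wait_queue_head_t read_waiters;
};

static inline void init_f2fs_rwsem(struct f2fs_rwsem *sem)
{
	init_rwsem(&sem->internal_rwsem);
	init_waitqueue_head(&sem->read_waiters);
}

static inline void f2fs_down_read(struct f2fs_rwsem *sem)
{
	/* park on the wait queue until a read trylock succeeds */
	wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem));
}

static inline void f2fs_up_read(struct f2fs_rwsem *sem)
{
	up_read(&sem->internal_rwsem);
}

static inline void f2fs_down_write(struct f2fs_rwsem *sem)
{
	down_write(&sem->internal_rwsem);
}

static inline void f2fs_up_write(struct f2fs_rwsem *sem)
{
	up_write(&sem->internal_rwsem);
	/* let the readers parked above retry their trylock */
	wake_up_all(&sem->read_waiters);
}

Because readers never sit on the rwsem's own wait list, a writer calling f2fs_down_write() only contends with readers that already hold the lock, which is what removes the priority inversion described above.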
Signed-off-by: Tim Murray Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 34 +++++----- fs/f2fs/compress.c | 6 +- fs/f2fs/data.c | 72 ++++++++++----------- fs/f2fs/dir.c | 12 ++-- fs/f2fs/f2fs.h | 112 +++++++++++++++++++++++++------- fs/f2fs/file.c | 150 +++++++++++++++++++++---------------------- fs/f2fs/gc.c | 46 ++++++------- fs/f2fs/inline.c | 4 +- fs/f2fs/namei.c | 34 +++++----- fs/f2fs/node.c | 84 ++++++++++++------------ fs/f2fs/recovery.c | 4 +- fs/f2fs/segment.c | 44 ++++++------- fs/f2fs/super.c | 58 ++++++++--------- fs/f2fs/sysfs.c | 4 +- fs/f2fs/verity.c | 4 +- fs/f2fs/xattr.c | 12 ++-- 16 files changed, 374 insertions(+), 306 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5de8a1c8f2a4..c7a6d48e0da4 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -351,13 +351,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* if locked failed, cp will flush dirty pages instead */ - if (!down_write_trylock(&sbi->cp_global_sem)) + if (!f2fs_down_write_trylock(&sbi->cp_global_sem)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); - up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->cp_global_sem); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -1159,7 +1159,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) if (!is_journalled_quota(sbi)) return false; - if (!down_write_trylock(&sbi->quota_sem)) + if (!f2fs_down_write_trylock(&sbi->quota_sem)) return true; if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { ret = false; @@ -1171,7 +1171,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) } else if (get_pages(sbi, F2FS_DIRTY_QDATA)) { ret = true; } - up_write(&sbi->quota_sem); + f2fs_up_write(&sbi->quota_sem); return ret; } @@ -1228,10 +1228,10 @@ static int block_operations(struct f2fs_sb_info *sbi) * POR: we should ensure that there are no dirty node pages * until finishing nat/sit flush. inode->i_blocks can be updated. */ - down_write(&sbi->node_change); + f2fs_down_write(&sbi->node_change); if (get_pages(sbi, F2FS_DIRTY_IMETA)) { - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) @@ -1241,15 +1241,15 @@ static int block_operations(struct f2fs_sb_info *sbi) } retry_flush_nodes: - down_write(&sbi->node_write); + f2fs_down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - up_write(&sbi->node_write); + f2fs_up_write(&sbi->node_write); atomic_inc(&sbi->wb_sync_req[NODE]); err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); atomic_dec(&sbi->wb_sync_req[NODE]); if (err) { - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); return err; } @@ -1262,13 +1262,13 @@ static int block_operations(struct f2fs_sb_info *sbi) * dirty node blocks and some checkpoint values by block allocation. 
*/ __prepare_cp_block(sbi); - up_write(&sbi->node_change); + f2fs_up_write(&sbi->node_change); return err; } static void unblock_operations(struct f2fs_sb_info *sbi) { - up_write(&sbi->node_write); + f2fs_up_write(&sbi->node_write); f2fs_unlock_all(sbi); } @@ -1612,7 +1612,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_warn(sbi, "Start checkpoint disabled!"); } if (cpc->reason != CP_RESIZE) - down_write(&sbi->cp_global_sem); + f2fs_down_write(&sbi->cp_global_sem); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || @@ -1693,7 +1693,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: if (cpc->reason != CP_RESIZE) - up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->cp_global_sem); return err; } @@ -1741,9 +1741,9 @@ static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) struct cp_control cpc = { .reason = CP_SYNC, }; int err; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); return err; } @@ -1831,9 +1831,9 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { int ret; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); ret = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); return ret; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index f0a7aba740d6..e13f3cb1eef2 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1237,7 +1237,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. 
*/ - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); } else if (!f2fs_trylock_op(sbi)) { goto out_free; } @@ -1354,7 +1354,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, f2fs_put_dnode(&dn); if (IS_NOQUOTA(inode)) - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); @@ -1380,7 +1380,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, f2fs_put_dnode(&dn); out_unlock_op: if (IS_NOQUOTA(inode)) - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); out_free: diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9df0cbda8369..e86a72c0b06c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -571,7 +571,7 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - down_write(&io->io_rwsem); + f2fs_down_write(&io->io_rwsem); /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { @@ -582,7 +582,7 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); - up_write(&io->io_rwsem); + f2fs_up_write(&io->io_rwsem); } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, @@ -597,9 +597,9 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - down_read(&io->io_rwsem); + f2fs_down_read(&io->io_rwsem); ret = __has_merged_page(io->bio, inode, page, ino); - up_read(&io->io_rwsem); + f2fs_up_read(&io->io_rwsem); } if (ret) __f2fs_submit_merged_write(sbi, type, temp); @@ -720,9 +720,9 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) f2fs_bug_on(sbi, 1); - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_add_tail(&be->list, &io->bio_list); - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } static void del_bio_entry(struct bio_entry *be) @@ -743,7 +743,7 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, struct list_head *head = &io->bio_list; struct bio_entry *be; - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (be->bio != *bio) continue; @@ -761,7 +761,7 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, __submit_bio(sbi, *bio, DATA); break; } - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } if (ret) { @@ -787,7 +787,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, if (list_empty(head)) continue; - down_read(&io->bio_list_lock); + f2fs_down_read(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); @@ -797,14 +797,14 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, if (found) break; } - up_read(&io->bio_list_lock); + f2fs_up_read(&io->bio_list_lock); if (!found) continue; found = false; - down_write(&io->bio_list_lock); + f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); @@ -817,7 +817,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, break; } } - up_write(&io->bio_list_lock); + f2fs_up_write(&io->bio_list_lock); } if (found) @@ -875,7 +875,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_bug_on(sbi, is_read_io(fio->op)); - 
down_write(&io->io_rwsem); + f2fs_down_write(&io->io_rwsem); next: if (fio->in_list) { spin_lock(&io->io_lock); @@ -937,7 +937,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); - up_write(&io->io_rwsem); + f2fs_up_write(&io->io_rwsem); } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, @@ -1350,9 +1350,9 @@ void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { if (lock) - down_read(&sbi->node_change); + f2fs_down_read(&sbi->node_change); else - up_read(&sbi->node_change); + f2fs_up_read(&sbi->node_change); } else { if (lock) f2fs_lock_op(sbi); @@ -2802,13 +2802,13 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, * the below discard race condition. */ if (IS_NOQUOTA(inode)) - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); if (IS_NOQUOTA(inode)) - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); goto done; } @@ -3268,14 +3268,14 @@ static void f2fs_write_failed(struct inode *inode, loff_t to) /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3643,21 +3643,21 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_hint = WRITE_LIFE_NOT_SET; if (iocb->ki_flags & IOCB_NOWAIT) { - if (!down_read_trylock(&fi->i_gc_rwsem[rw])) { + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[rw])) { iocb->ki_hint = hint; err = -EAGAIN; goto out; } - if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { - up_read(&fi->i_gc_rwsem[rw]); + if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + f2fs_up_read(&fi->i_gc_rwsem[rw]); iocb->ki_hint = hint; err = -EAGAIN; goto out; } } else { - down_read(&fi->i_gc_rwsem[rw]); + f2fs_down_read(&fi->i_gc_rwsem[rw]); if (do_opu) - down_read(&fi->i_gc_rwsem[READ]); + f2fs_down_read(&fi->i_gc_rwsem[READ]); } err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, @@ -3667,9 +3667,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) DIO_SKIP_HOLES); if (do_opu) - up_read(&fi->i_gc_rwsem[READ]); + f2fs_up_read(&fi->i_gc_rwsem[READ]); - up_read(&fi->i_gc_rwsem[rw]); + f2fs_up_read(&fi->i_gc_rwsem[rw]); if (rw == WRITE) { if (whint_mode == WHINT_MODE_OFF) @@ -3928,13 +3928,13 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, unsigned int end_sec = secidx + blkcnt / blk_per_sec; int ret = 0; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); set_inode_flag(inode, FI_ALIGNED_WRITE); for (; secidx < end_sec; secidx++) { - down_write(&sbi->pin_sem); + f2fs_down_write(&sbi->pin_sem); f2fs_lock_op(sbi); f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); @@ -3948,7 +3948,7 @@ static int f2fs_migrate_blocks(struct 
inode *inode, block_t start_blk, page = f2fs_get_lock_data_page(inode, blkidx, true); if (IS_ERR(page)) { - up_write(&sbi->pin_sem); + f2fs_up_write(&sbi->pin_sem); ret = PTR_ERR(page); goto done; } @@ -3961,7 +3961,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, ret = filemap_fdatawrite(inode->i_mapping); - up_write(&sbi->pin_sem); + f2fs_up_write(&sbi->pin_sem); if (ret) break; @@ -3971,8 +3971,8 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, clear_inode_flag(inode, FI_DO_DEFRAG); clear_inode_flag(inode, FI_ALIGNED_WRITE); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 3ea73c44906d..315c6472b3c2 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -768,7 +768,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, f2fs_wait_on_page_writeback(dentry_page, DATA, true, true); if (inode) { - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, fname, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -795,7 +795,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, f2fs_update_parent_metadata(dir, inode, current_depth); fail: if (inode) - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); f2fs_put_page(dentry_page, 1); @@ -860,7 +860,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) struct page *page; int err = 0; - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -871,7 +871,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) clear_inode_flag(inode, FI_NEW_INODE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); return err; } @@ -879,7 +879,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); @@ -890,7 +890,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) f2fs_i_links_write(inode, false); f2fs_i_size_write(inode, 0); } - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) f2fs_add_orphan_inode(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 81600939329f..990f3a70c9e2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -120,6 +120,18 @@ typedef u32 nid_t; #define COMPRESS_EXT_NUM 16 +/* + * An implementation of an rwsem that is explicitly unfair to readers. This + * prevents priority inversion when a low-priority reader acquires the read lock + * while sleeping on the write lock but the write lock is needed by + * higher-priority clients. 
+ */ + +struct f2fs_rwsem { + struct rw_semaphore internal_rwsem; + wait_queue_head_t read_waiters; +}; + struct f2fs_mount_info { unsigned int opt; int write_io_size_bits; /* Write IO size bits */ @@ -749,7 +761,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ - struct rw_semaphore i_sem; /* protect fi info */ + struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -775,9 +787,9 @@ struct f2fs_inode_info { struct extent_tree *extent_tree; /* cached extent_tree entry */ /* avoid racing between foreground op and gc */ - struct rw_semaphore i_gc_rwsem[2]; - struct rw_semaphore i_mmap_sem; - struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + struct f2fs_rwsem i_gc_rwsem[2]; + struct f2fs_rwsem i_mmap_sem; + struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ @@ -903,7 +915,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat entry tree */ + struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ @@ -1016,7 +1028,7 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct rw_semaphore curseg_lock; /* for preventing curseg change */ + struct f2fs_rwsem curseg_lock; /* for preventing curseg change */ block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ @@ -1200,11 +1212,11 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. 
*/ - struct rw_semaphore io_rwsem; /* blocking op for bio */ + struct f2fs_rwsem io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ struct list_head bio_list; /* bio entry list head */ - struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */ + struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */ }; #define FDEV(i) (sbi->devs[i]) @@ -1575,7 +1587,7 @@ struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - struct rw_semaphore sb_lock; /* lock for raw super block */ + struct f2fs_rwsem sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ struct mutex writepages; /* mutex for writepages() */ @@ -1595,7 +1607,7 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ /* keep migration IO order for LFS mode */ - struct rw_semaphore io_order_lock; + struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ @@ -1603,10 +1615,10 @@ struct f2fs_sb_info { int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ - struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */ - struct rw_semaphore cp_rwsem; /* blocking FS operations */ - struct rw_semaphore node_write; /* locking node writes */ - struct rw_semaphore node_change; /* locking node change */ + struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */ + struct f2fs_rwsem cp_rwsem; /* blocking FS operations */ + struct f2fs_rwsem node_write; /* locking node writes */ + struct f2fs_rwsem node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -1666,7 +1678,7 @@ struct f2fs_sb_info { block_t unusable_block_count; /* # of blocks saved by last cp */ unsigned int nquota_files; /* # of quota sysfile */ - struct rw_semaphore quota_sem; /* blocking cp for flags */ + struct f2fs_rwsem quota_sem; /* blocking cp for flags */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1682,7 +1694,7 @@ struct f2fs_sb_info { struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning operations */ - struct rw_semaphore gc_lock; /* + struct f2fs_rwsem gc_lock; /* * semaphore for GC, avoid * race between GC and GC or CP */ @@ -1702,7 +1714,7 @@ struct f2fs_sb_info { /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; - struct rw_semaphore pin_sem; + struct f2fs_rwsem pin_sem; /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -2102,9 +2114,65 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } +static inline void init_f2fs_rwsem(struct f2fs_rwsem *sem) +{ + init_rwsem(&sem->internal_rwsem); + init_waitqueue_head(&sem->read_waiters); +} + +static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem) +{ + return rwsem_is_locked(&sem->internal_rwsem); +} + +static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem) +{ + return rwsem_is_contended(&sem->internal_rwsem); +} + +static inline void f2fs_down_read(struct f2fs_rwsem *sem) +{ + 
wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem)); +} + +static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) +{ + return down_read_trylock(&sem->internal_rwsem); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_read_nested(&sem->internal_rwsem, subclass); +} +#else +#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) +#endif + +static inline void f2fs_up_read(struct f2fs_rwsem *sem) +{ + up_read(&sem->internal_rwsem); +} + +static inline void f2fs_down_write(struct f2fs_rwsem *sem) +{ + down_write(&sem->internal_rwsem); +} + +static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) +{ + return down_write_trylock(&sem->internal_rwsem); +} + +static inline void f2fs_up_write(struct f2fs_rwsem *sem) +{ + up_write(&sem->internal_rwsem); + wake_up_all(&sem->read_waiters); +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - down_read(&sbi->cp_rwsem); + f2fs_down_read(&sbi->cp_rwsem); } static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) @@ -2113,22 +2181,22 @@ static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) f2fs_show_injection_info(sbi, FAULT_LOCK_OP); return 0; } - return down_read_trylock(&sbi->cp_rwsem); + return f2fs_down_read_trylock(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - up_read(&sbi->cp_rwsem); + f2fs_up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - down_write(&sbi->cp_rwsem); + f2fs_down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - up_write(&sbi->cp_rwsem); + f2fs_up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 82c01cc8624d..f9d241e5f387 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -38,9 +38,9 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); vm_fault_t ret; - down_read(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_read(&F2FS_I(inode)->i_mmap_sem); ret = filemap_fault(vmf); - up_read(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_read(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, @@ -101,7 +101,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); file_update_time(vmf->vma->vm_file); - down_read(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || @@ -159,7 +159,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) trace_f2fs_vm_page_mkwrite(page, DATA); out_sem: - up_read(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_read(&F2FS_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); err: @@ -237,13 +237,13 @@ static void try_to_fix_pino(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); nid_t pino; - down_write(&fi->i_sem); + f2fs_down_write(&fi->i_sem); if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); file_got_pino(inode); } - up_write(&fi->i_sem); + f2fs_up_write(&fi->i_sem); } static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, @@ -318,9 +318,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. 
*/ - down_read(&F2FS_I(inode)->i_sem); + f2fs_down_read(&F2FS_I(inode)->i_sem); cp_reason = need_do_checkpoint(inode); - up_read(&F2FS_I(inode)->i_sem); + f2fs_up_read(&F2FS_I(inode)->i_sem); if (cp_reason) { /* all the dirty node pages should be flushed for POR */ @@ -978,8 +978,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return err; } - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); @@ -989,8 +989,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. */ - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -1130,8 +1130,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache_range(inode, blk_start, blk_end - 1); @@ -1139,8 +1139,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1373,8 +1373,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1382,8 +1382,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1413,13 +1413,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); new_size = i_size_read(inode) - len; ret = f2fs_truncate_blocks(inode, new_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); return ret; @@ -1518,8 +1518,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, unsigned int end_offset; pgoff_t end; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache_range(inode, (loff_t)index << PAGE_SHIFT, @@ -1531,8 +1531,8 @@ static int 
f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1543,8 +1543,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1600,9 +1600,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); - up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) return ret; @@ -1617,8 +1617,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1634,14 +1634,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); - up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1691,13 +1691,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset, next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err && err != -ENODATA && err != -EAGAIN) goto out_err; } - down_write(&sbi->pin_sem); + f2fs_down_write(&sbi->pin_sem); f2fs_lock_op(sbi); f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); @@ -1707,7 +1707,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); file_dont_truncate(inode); - up_write(&sbi->pin_sem); + f2fs_up_write(&sbi->pin_sem); expanded += map.m_len; sec_len -= map.m_len; @@ -2095,7 +2095,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by @@ -2106,7 +2106,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); 
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -2119,7 +2119,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; @@ -2430,7 +2430,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) return err; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) goto got_it; @@ -2449,7 +2449,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) 16)) err = -EFAULT; out_err: - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); return err; } @@ -2526,12 +2526,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; if (!sync) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); @@ -2562,12 +2562,12 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) do_more: if (!range->sync) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, range->sync, true, false, @@ -2899,10 +2899,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) goto out_src; } @@ -2920,9 +2920,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_unlock_op(sbi); if (src != dst) - up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); out_src: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); out_unlock: if (src != dst) inode_unlock(dst); @@ -3017,7 +3017,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) end_segno = min(start_segno + range.segments, dev_end_segno); while (start_segno < end_segno) { - if (!down_write_trylock(&sbi->gc_lock)) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } @@ -3363,9 +3363,9 @@ int f2fs_precache_extents(struct inode *inode) while (map.m_lblk < end) { map.m_len = end - map.m_lblk; - down_write(&fi->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); - up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); if (err) return err; @@ -3442,11 +3442,11 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) if (!vbuf) return -ENOMEM; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); count = utf16s_to_utf8s(sbi->raw_super->volume_name, ARRAY_SIZE(sbi->raw_super->volume_name), UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME); - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); if (copy_to_user((char __user *)arg, vbuf, 
min(FSLABEL_MAX, count))) @@ -3474,7 +3474,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) if (err) goto out; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); memset(sbi->raw_super->volume_name, 0, sizeof(sbi->raw_super->volume_name)); @@ -3484,7 +3484,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) err = f2fs_commit_super(sbi, false); - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); mnt_drop_write_file(filp); out: @@ -3610,8 +3610,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3646,8 +3646,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) released_blocks += ret; } - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); out: inode_unlock(inode); @@ -3763,8 +3763,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) goto unlock_inode; } - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3799,8 +3799,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) reserved_blocks += ret; } - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); @@ -3918,8 +3918,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) if (ret) goto err; - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, range.start, to_end ? LLONG_MAX : end_addr - 1); @@ -4006,8 +4006,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) ret = f2fs_secure_erase(prev_bdev, inode, prev_index, prev_block, len, range.flags); out: - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); err: inode_unlock(inode); file_end_write(filp); @@ -4534,12 +4534,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Don't leave any preallocated blocks around past i_size. 
*/ if (preallocated && i_size_read(inode) < target_size) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); if (!f2fs_truncate(inode)) file_dont_truncate(inode); - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } else { file_dont_truncate(inode); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a6accec60d04..7a486bbf9c07 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -105,21 +105,21 @@ static int gc_thread_func(void *data) spin_unlock(&sbi->gc_urgent_high_lock); wait_ms = gc_th->urgent_sleep_time; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); goto do_gc; } if (foreground) { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); goto do_gc; - } else if (!down_write_trylock(&sbi->gc_lock)) { + } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); stat_io_skip_bggc_count(sbi); goto next; } @@ -1230,7 +1230,7 @@ static int move_data_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; if (lfs_mode) - down_write(&fio.sbi->io_order_lock); + f2fs_down_write(&fio.sbi->io_order_lock); mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi), fio.old_blkaddr, false); @@ -1316,7 +1316,7 @@ static int move_data_block(struct inode *inode, block_t bidx, true, true, true); up_out: if (lfs_mode) - up_write(&fio.sbi->io_order_lock); + f2fs_up_write(&fio.sbi->io_order_lock); put_out: f2fs_put_dnode(&dn); out: @@ -1476,7 +1476,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, special_file(inode->i_mode)) continue; - if (!down_write_trylock( + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); sbi->skipped_gc_rwsem++; @@ -1489,7 +1489,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (f2fs_post_read_required(inode)) { int err = ra_data_block(inode, start_bidx); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) { iput(inode); continue; @@ -1500,7 +1500,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, data_page = f2fs_get_read_data_page(inode, start_bidx, REQ_RAHEAD, true); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -1519,14 +1519,14 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, int err; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->i_gc_rwsem[READ])) { + if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) { sbi->skipped_gc_rwsem++; continue; } - if (!down_write_trylock( + if (!f2fs_down_write_trylock( &fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; - up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[READ]); continue; } locked = true; @@ -1549,8 +1549,8 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, submitted++; if (locked) { - up_write(&fi->i_gc_rwsem[WRITE]); - up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[READ]); } stat_inc_data_blk_count(sbi, 1, 
gc_type); @@ -1808,7 +1808,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, reserved_segments(sbi), prefree_segments(sbi)); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); put_gc_inode(&gc_list); @@ -1937,7 +1937,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) long long block_count; int segs = secs * sbi->segs_per_sec; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); section_count = le32_to_cpu(raw_sb->section_count); segment_count = le32_to_cpu(raw_sb->segment_count); @@ -1958,7 +1958,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) cpu_to_le32(dev_segs + segs); } - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); } static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) @@ -2032,7 +2032,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); /* stop other GC */ - if (!down_write_trylock(&sbi->gc_lock)) + if (!f2fs_down_write_trylock(&sbi->gc_lock)) return -EAGAIN; /* stop CP to protect MAIN_SEC in free_segment_range */ @@ -2052,15 +2052,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) out_unlock: f2fs_unlock_op(sbi); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); if (err) return err; set_sbi_flag(sbi, SBI_IS_RESIZEFS); freeze_super(sbi->sb); - down_write(&sbi->gc_lock); - down_write(&sbi->cp_global_sem); + f2fs_down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->cp_global_sem); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + @@ -2105,8 +2105,8 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) spin_unlock(&sbi->stat_lock); } out_err: - up_write(&sbi->cp_global_sem); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); return err; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 9f7a27e149bf..4221c83c92d9 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -628,7 +628,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, } if (inode) { - down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, fname, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -657,7 +657,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, f2fs_update_parent_metadata(dir, inode, 0); fail: if (inode) - up_write(&F2FS_I(inode)->i_sem); + f2fs_up_write(&F2FS_I(inode)->i_sem); out: f2fs_put_page(ipage, 1); return err; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 8c7ade66e857..b025055df431 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -191,7 +191,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; int i, cold_count, hot_count; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; @@ -201,7 +201,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * break; } - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); if (i == cold_count + hot_count) return; @@ -294,19 +294,19 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, (!ext_cnt && !noext_cnt)) return; - down_read(&sbi->sb_lock); + f2fs_down_read(&sbi->sb_lock); cold_count = 
le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; for (i = cold_count; i < cold_count + hot_count; i++) { if (is_extension_exist(name, extlist[i], false)) { - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); return; } } - up_read(&sbi->sb_lock); + f2fs_up_read(&sbi->sb_lock); for (i = 0; i < noext_cnt; i++) { if (is_extension_exist(name, noext[i], false)) { @@ -1017,11 +1017,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_page = NULL; new_inode->i_ctime = current_time(new_inode); - down_write(&F2FS_I(new_inode)->i_sem); + f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); f2fs_i_links_write(new_inode, false); - up_write(&F2FS_I(new_inode)->i_sem); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); if (!new_inode->i_nlink) f2fs_add_orphan_inode(new_inode); @@ -1042,13 +1042,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, true); } - down_write(&F2FS_I(old_inode)->i_sem); + f2fs_down_write(&F2FS_I(old_inode)->i_sem); if (!old_dir_entry || whiteout) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(old_inode, new_dir->i_ino); - up_write(&F2FS_I(old_inode)->i_sem); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode, false); @@ -1208,38 +1208,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, /* update directory entry info of old dir inode */ f2fs_set_link(old_dir, old_entry, old_page, new_inode); - down_write(&F2FS_I(old_inode)->i_sem); + f2fs_down_write(&F2FS_I(old_inode)->i_sem); if (!old_dir_entry) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(old_inode, new_dir->i_ino); - up_write(&F2FS_I(old_inode)->i_sem); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); old_dir->i_ctime = current_time(old_dir); if (old_nlink) { - down_write(&F2FS_I(old_dir)->i_sem); + f2fs_down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); - up_write(&F2FS_I(old_dir)->i_sem); + f2fs_up_write(&F2FS_I(old_dir)->i_sem); } f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); - down_write(&F2FS_I(new_inode)->i_sem); + f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (!new_dir_entry) file_lost_pino(new_inode); else /* adjust dir's i_pino to pass fsck check */ f2fs_i_pino_write(new_inode, old_dir->i_ino); - up_write(&F2FS_I(new_inode)->i_sem); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); new_dir->i_ctime = current_time(new_dir); if (new_nlink) { - down_write(&F2FS_I(new_dir)->i_sem); + f2fs_down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); - up_write(&F2FS_I(new_dir)->i_sem); + f2fs_up_write(&F2FS_I(new_dir)->i_sem); } f2fs_mark_inode_dirty_sync(new_dir, false); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 62c8c246890f..56d08c6caca0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -382,14 +382,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool need = false; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) need = true; } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return need; } 
@@ -399,11 +399,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -413,13 +413,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool need_update = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return need_update; } @@ -431,14 +431,14 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct nat_entry *new, *e; /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ - if (rwsem_is_locked(&sbi->cp_global_sem)) + if (f2fs_rwsem_is_locked(&sbi->cp_global_sem)) return; new = __alloc_nat_entry(sbi, nid, false); if (!new) return; - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) e = __init_nat_entry(nm_i, new, ne, false); @@ -447,7 +447,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, nat_get_blkaddr(e) != le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); if (e != new) __free_nat_entry(new); } @@ -459,7 +459,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct nat_entry *e; struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = __init_nat_entry(nm_i, new, NULL, true); @@ -508,7 +508,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); } int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -516,7 +516,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; - if (!down_write_trylock(&nm_i->nat_tree_lock)) + if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock)) return 0; spin_lock(&nm_i->nat_list_lock); @@ -538,7 +538,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } spin_unlock(&nm_i->nat_list_lock); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -560,13 +560,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, ni->nid = nid; retry: /* Check nat cache */ - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return 0; } @@ -576,11 +576,11 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, * nat_tree_lock. Therefore, we should retry, if we failed to grab here * while not bothering checkpoint. 
*/ - if (!rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { + if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { down_read(&curseg->journal_rwsem); - } else if (rwsem_is_contended(&nm_i->nat_tree_lock) || + } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) || !down_read_trylock(&curseg->journal_rwsem)) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); goto retry; } @@ -589,15 +589,15 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - up_read(&curseg->journal_rwsem); + up_read(&curseg->journal_rwsem); if (i >= 0) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); goto cache; } /* Fill node_info from nat page */ index = current_nat_addr(sbi, nid); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); page = f2fs_get_meta_page(sbi, index); if (IS_ERR(page)) @@ -1609,17 +1609,17 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, goto redirty_out; if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) + if (!f2fs_down_read_trylock(&sbi->node_write)) goto redirty_out; } else { - down_read(&sbi->node_write); + f2fs_down_read(&sbi->node_write); } /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); unlock_page(page); return 0; } @@ -1627,7 +1627,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (__is_valid_data_blkaddr(ni.blk_addr) && !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); goto redirty_out; } @@ -1648,7 +1648,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); + f2fs_up_read(&sbi->node_write); if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE); @@ -2225,14 +2225,14 @@ bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) unsigned int i; bool ret = true; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (i = 0; i < nm_i->nat_blocks; i++) { if (!test_bit_le(i, nm_i->nat_block_bitmap)) { ret = false; break; } } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); return ret; } @@ -2415,7 +2415,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) unsigned int i, idx; nid_t nid; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (i = 0; i < nm_i->nat_blocks; i++) { if (!test_bit_le(i, nm_i->nat_block_bitmap)) @@ -2438,7 +2438,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) out: scan_curseg_cache(sbi); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); } static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, @@ -2473,7 +2473,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); while (1) { if (!test_bit_le(NAT_BLOCK_OFFSET(nid), @@ -2488,7 +2488,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, } if (ret) { - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); 
f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); return ret; } @@ -2508,7 +2508,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, /* find free nids from current sum_pages */ scan_curseg_cache(sbi); - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -2953,7 +2953,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs; - down_read(&nm_i->nat_tree_lock); + f2fs_down_read(&nm_i->nat_tree_lock); for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { unsigned int valid = 0, nid_ofs = 0; @@ -2973,7 +2973,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) __update_nat_bits(nm_i, nat_ofs, valid); } - up_read(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -3071,15 +3071,15 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * nat_cnt[DIRTY_NAT]. */ if (cpc->reason & CP_UMOUNT) { - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); } if (!nm_i->nat_cnt[DIRTY_NAT]) return 0; - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); /* * if there are no enough space in journal to store dirty nat @@ -3108,7 +3108,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); /* Allow dirty nats by node block allocation in write_begin */ return err; @@ -3228,7 +3228,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->nid_list_lock); - init_rwsem(&nm_i->nat_tree_lock); + init_f2fs_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); @@ -3334,7 +3334,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ - down_write(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; @@ -3364,7 +3364,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) kmem_cache_free(nat_entry_set_slab, setvec[idx]); } } - up_write(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); kvfree(nm_i->nat_block_bitmap); if (nm_i->free_nid_bitmap) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index f6d12872fde3..d33e63805e11 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -795,7 +795,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ - down_write(&sbi->cp_global_sem); + f2fs_down_write(&sbi->cp_global_sem); /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); @@ -829,7 +829,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } else { clear_sbi_flag(sbi, SBI_POR_DOING); } - up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->cp_global_sem); /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list, err); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7564fa290647..877389a5fa92 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -474,7 +474,7 @@ 
int f2fs_commit_inmem_pages(struct inode *inode) f2fs_balance_fs(sbi, true); - down_write(&fi->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); @@ -486,7 +486,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) clear_inode_flag(inode, FI_ATOMIC_COMMIT); f2fs_unlock_op(sbi); - up_write(&fi->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; } @@ -524,7 +524,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); f2fs_gc(sbi, false, false, false, NULL_SEGNO); } } @@ -532,7 +532,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) { - int factor = rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; + int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); @@ -573,7 +573,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) /* there is background inflight IO or foreground operation recently */ if (is_inflight_io(sbi, REQ_TIME) || - (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem))) + (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) return; /* exceed periodical checkpoint timeout threshold */ @@ -2823,7 +2823,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) if (!sbi->am.atgc_enabled) return; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); @@ -2833,7 +2833,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) up_write(&SIT_I(sbi)->sentry_lock); mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) @@ -2984,7 +2984,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); @@ -3008,7 +3008,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, type, segno, curseg->segno); mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, @@ -3040,23 +3040,23 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi, void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); __allocate_new_section(sbi, type, force); up_write(&SIT_I(sbi)->sentry_lock); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); - up_read(&SM_I(sbi)->curseg_lock); + 
f2fs_up_read(&SM_I(sbi)->curseg_lock); } static const struct segment_allocation default_salloc_ops = { @@ -3194,9 +3194,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (sbi->discard_blks == 0) goto out; - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); if (err) goto out; @@ -3433,7 +3433,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; - down_read(&SM_I(sbi)->curseg_lock); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); @@ -3516,7 +3516,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); - up_read(&SM_I(sbi)->curseg_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); } void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, @@ -3552,7 +3552,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); if (keep_order) - down_read(&fio->sbi->io_order_lock); + f2fs_down_read(&fio->sbi->io_order_lock); reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio); @@ -3572,7 +3572,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); if (keep_order) - up_read(&fio->sbi->io_order_lock); + f2fs_up_read(&fio->sbi->io_order_lock); } void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, @@ -3707,7 +3707,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, se = get_seg_entry(sbi, segno); type = se->type; - down_write(&SM_I(sbi)->curseg_lock); + f2fs_down_write(&SM_I(sbi)->curseg_lock); if (!recover_curseg) { /* for recovery flow */ @@ -3776,7 +3776,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); - up_write(&SM_I(sbi)->curseg_lock); + f2fs_up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, @@ -4905,7 +4905,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sm_info->sit_entry_set); - init_rwsem(&sm_info->curseg_lock); + init_f2fs_rwsem(&sm_info->curseg_lock); if (!f2fs_readonly(sbi->sb)) { err = f2fs_create_flush_cmd_control(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 94c5166b5a31..54c129050b60 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1344,17 +1344,17 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); atomic_set(&fi->i_compr_blocks, 0); - init_rwsem(&fi->i_sem); + init_f2fs_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - init_rwsem(&fi->i_gc_rwsem[READ]); - init_rwsem(&fi->i_gc_rwsem[WRITE]); - init_rwsem(&fi->i_mmap_sem); - init_rwsem(&fi->i_xattr_sem); + init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); + init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); + init_f2fs_rwsem(&fi->i_mmap_sem); + init_f2fs_rwsem(&fi->i_xattr_sem); /* Will be used by directory only */ fi->i_dir_level = 
F2FS_SB(sb)->dir_level; @@ -2083,7 +2083,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) f2fs_update_time(sbi, DISABLE_TIME); while (!f2fs_time_over(sbi, DISABLE_TIME)) { - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err == -ENODATA) { err = 0; @@ -2105,7 +2105,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); err = f2fs_write_checkpoint(sbi, &cpc); @@ -2117,7 +2117,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); out_unlock: - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); restore_flag: sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; @@ -2137,12 +2137,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) if (unlikely(retry < 0)) f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); - down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); - up_write(&sbi->gc_lock); + f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); } @@ -2703,18 +2703,18 @@ int f2fs_quota_sync(struct super_block *sb, int type) /* * do_quotactl * f2fs_quota_sync - * down_read(quota_sem) + * f2fs_down_read(quota_sem) * dquot_writeback_dquots() * f2fs_dquot_commit * block_operation - * down_read(quota_sem) + * f2fs_down_read(quota_sem) */ f2fs_lock_op(sbi); - down_read(&sbi->quota_sem); + f2fs_down_read(&sbi->quota_sem); ret = f2fs_quota_sync_file(sbi, cnt); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); inode_unlock(dqopt->files[cnt]); @@ -2839,11 +2839,11 @@ static int f2fs_dquot_commit(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); + f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); ret = dquot_commit(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); return ret; } @@ -2852,11 +2852,11 @@ static int f2fs_dquot_acquire(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read(&sbi->quota_sem); + f2fs_down_read(&sbi->quota_sem); ret = dquot_acquire(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); + f2fs_up_read(&sbi->quota_sem); return ret; } @@ -3577,14 +3577,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - init_rwsem(&sbi->io_order_lock); + init_f2fs_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); - init_rwsem(&sbi->sb_lock); - init_rwsem(&sbi->pin_sem); + init_f2fs_rwsem(&sbi->sb_lock); + init_f2fs_rwsem(&sbi->pin_sem); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -4029,11 +4029,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - init_rwsem(&sbi->gc_lock); + init_f2fs_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); - init_rwsem(&sbi->cp_global_sem); - init_rwsem(&sbi->node_write); - init_rwsem(&sbi->node_change); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + 
init_f2fs_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); @@ -4054,18 +4054,18 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } for (j = HOT; j < n; j++) { - init_rwsem(&sbi->write_io[i][j].io_rwsem); + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); sbi->write_io[i][j].sbi = sbi; sbi->write_io[i][j].bio = NULL; spin_lock_init(&sbi->write_io[i][j].io_lock); INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_rwsem(&sbi->write_io[i][j].bio_list_lock); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); } } - init_rwsem(&sbi->cp_rwsem); - init_rwsem(&sbi->quota_sem); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3491fa294ce2..73f16de45f12 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -367,7 +367,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) return -EINVAL; - down_write(&sbi->sb_lock); + f2fs_down_write(&sbi->sb_lock); ret = f2fs_update_extension_list(sbi, name, hot, set); if (ret) @@ -377,7 +377,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (ret) f2fs_update_extension_list(sbi, name, hot, !set); out: - up_write(&sbi->sb_lock); + f2fs_up_write(&sbi->sb_lock); return ret ? ret : count; } diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index f5e9a5e2cb88..074b9f415822 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -208,7 +208,7 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc, * from re-instantiating cached pages we are truncating (since unlike * normal file accesses, garbage collection isn't limited by i_size). 
*/ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); truncate_inode_pages(inode->i_mapping, inode->i_size); err2 = f2fs_truncate(inode); if (err2) { @@ -216,7 +216,7 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc, err2); set_sbi_flag(sbi, SBI_NEED_FSCK); } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); return err ?: err2; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index d125f8c61bf5..1cd844a8e38a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -523,10 +523,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - down_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr, &base_size, &is_inline); - up_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -560,9 +560,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error; size_t rest = buffer_size; - down_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); - up_read(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -784,9 +784,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - down_write(&F2FS_I(inode)->i_xattr_sem); + f2fs_down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - up_write(&F2FS_I(inode)->i_xattr_sem); + f2fs_up_write(&F2FS_I(inode)->i_xattr_sem); f2fs_unlock_op(sbi); f2fs_update_time(sbi, REQ_TIME); From 9c2a619bf565bf3eb9891ad49f925535cd9730f7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 Jan 2022 11:48:02 +0800 Subject: [PATCH 367/580] f2fs: fix to enable ATGC correctly via gc_idle sysfs interface It needs to assign sbi->gc_mode with GC_IDLE_AT rather than GC_AT when user tries to enable ATGC via gc_idle sysfs interface, fix it. Fixes: 093749e296e2 ("f2fs: support age threshold based garbage collection") Cc: Zhipeng Tan Signed-off-by: Jicheng Shao Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 73f16de45f12..2631a4a12540 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -485,7 +485,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, } else if (t == GC_IDLE_AT) { if (!sbi->am.atgc_enabled) return -EINVAL; - sbi->gc_mode = GC_AT; + sbi->gc_mode = GC_IDLE_AT; } else { sbi->gc_mode = GC_NORMAL; } From 45ee1be16ec1ee7df0f0d69cc6e33b4ff82bf0d3 Mon Sep 17 00:00:00 2001 From: Konstantin Vyshetsky Date: Mon, 13 Dec 2021 17:12:03 -0800 Subject: [PATCH 368/580] f2fs: move discard parameters into discard_cmd_control This patch unifies parameters related to how often discard is issued and how many requests go out at the same time by placing them in discard_cmd_control. The move will allow the parameters to be modified in the future without relying on hard-coded values. 
Signed-off-by: Konstantin Vyshetsky Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++++ fs/f2fs/segment.c | 22 +++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 990f3a70c9e2..2f2a3173f2d4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -395,6 +395,10 @@ struct discard_cmd_control { struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ + unsigned int max_discard_request; /* max. discard request per round */ + unsigned int min_discard_issue_time; /* min. interval between discard issue */ + unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ + unsigned int max_discard_issue_time; /* max. interval between discard issue */ unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 877389a5fa92..b3847165c65e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1159,14 +1159,14 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->ordered = false; dpolicy->granularity = granularity; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->max_requests = dcc->max_discard_request; dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; @@ -1174,12 +1174,12 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->granularity = 1; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = - DEF_MIN_DISCARD_ISSUE_TIME; + dcc->min_discard_issue_time; } } else if (discard_type == DPOLICY_FORCE) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; @@ -1784,7 +1784,7 @@ static int issue_discard_thread(void *data) struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; struct discard_policy dpolicy; - unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + unsigned int wait_ms = dcc->min_discard_issue_time; int issued; set_freezable(); @@ -2182,6 +2182,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; + dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; + dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; + dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; From 003c72fe50ecfa17c5a667d5de50253160c0b749 Mon Sep 17 00:00:00 2001 From: Konstantin 
Vyshetsky Date: Mon, 13 Dec 2021 17:12:43 -0800 Subject: [PATCH 369/580] f2fs: expose discard related parameters in sysfs This patch exposes max_discard_request, min_discard_issue_time, mid_discard_issue_time, and max_discard_issue_time in sysfs. This will allow the user to fine tune discard operations. Signed-off-by: Konstantin Vyshetsky Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 27 +++++++++++++++++++++++++ fs/f2fs/sysfs.c | 8 ++++++++ 2 files changed, 35 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index d033fa7de9f2..362014fd8290 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -88,6 +88,33 @@ Description: Controls the issue rate of discard commands that consist of small checkpoint is triggered, and issued during the checkpoint. By default, it is disabled with 0. +What: /sys/fs/f2fs//max_discard_request +Date: December 2021 +Contact: "Konstantin Vyshetsky" +Description: Controls the number of discards a thread will issue at a time. + Higher number will allow the discard thread to finish its work + faster, at the cost of higher latency for incomming I/O. + +What: /sys/fs/f2fs//min_discard_issue_time +Date: December 2021 +Contact: "Konstantin Vyshetsky" +Description: Controls the interval the discard thread will wait between + issuing discard requests when there are discards to be issued and + no I/O aware interruptions occur. + +What: /sys/fs/f2fs//mid_discard_issue_time +Date: December 2021 +Contact: "Konstantin Vyshetsky" +Description: Controls the interval the discard thread will wait between + issuing discard requests when there are discards to be issued and + an I/O aware interruption occurs. + +What: /sys/fs/f2fs//max_discard_issue_time +Date: December 2021 +Contact: "Konstantin Vyshetsky" +Description: Controls the interval the discard thread will wait when there are + no discard operations to be issued. 
+ What: /sys/fs/f2fs//discard_granularity Date: July 2017 Contact: "Chao Yu" diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 2631a4a12540..1b13d741fe02 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -712,6 +712,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); @@ -827,6 +831,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(reclaim_segments), ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), + ATTR_LIST(max_discard_request), + ATTR_LIST(min_discard_issue_time), + ATTR_LIST(mid_discard_issue_time), + ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), ATTR_LIST(pending_discard), ATTR_LIST(batched_trim_sections), From 7e5b66d44864c9138d39ca2b7ee830f3a759bd6a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Feb 2022 11:21:14 +0800 Subject: [PATCH 370/580] f2fs: fix to unlock page correctly in error path of is_alive() As Pavel Machek reported in below link [1]: After commit 77900c45ee5c ("f2fs: fix to do sanity check in is_alive()"), node page should be unlock via calling f2fs_put_page() in the error path of is_alive(), otherwise, f2fs may hang when it tries to lock the node page, fix it. [1] https://lore.kernel.org/stable/20220124203637.GA19321@duo.ucw.cz/ Fixes: 77900c45ee5c ("f2fs: fix to do sanity check in is_alive()") Cc: Reported-by: Pavel Machek Signed-off-by: Pavel Machek Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7a486bbf9c07..8ab4b40c12c5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1038,8 +1038,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } - if (f2fs_check_nid_range(sbi, dni->ino)) + if (f2fs_check_nid_range(sbi, dni->ino)) { + f2fs_put_page(node_page, 1); return false; + } *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); From 03dd04e8c8e96dc012fa46f3e8c4df33e28eeab3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Feb 2022 08:34:10 +0800 Subject: [PATCH 371/580] f2fs: adjust readahead block number during recovery In a fragmented image, entries in dnode block list may locate in incontiguous physical block address space, however, in recovery flow, we will always readahead BIO_MAX_VECS size blocks, so in such case, current readahead policy is low efficient, let's adjust readahead window size dynamically based on consecutiveness of dnode blocks. 
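The heuristic introduced here (adjust_por_ra_blocks() in the diff that follows) grows the readahead window while the dnode chain stays physically consecutive and shrinks it once the chain jumps to an address that is not the start of a segment. A stand-alone sketch of that grow/shrink behaviour, with the kernel helpers replaced by plain C; the limit and segment-size values are illustrative stand-ins for BIO_MAX_PAGES, RECOVERY_MIN_RA_BLOCKS and sbi->blocks_per_seg:

#include <stdio.h>

#define RECOVERY_MAX_RA_BLOCKS	256	/* the patch uses BIO_MAX_PAGES here */
#define RECOVERY_MIN_RA_BLOCKS	1
#define BLOCKS_PER_SEG		512	/* 2MB segment / 4KB block in f2fs */

static unsigned int adjust_ra_blocks(unsigned int ra_blocks,
				     unsigned int blkaddr,
				     unsigned int next_blkaddr)
{
	if (blkaddr + 1 == next_blkaddr) {
		/* next dnode block is physically consecutive: double the window */
		ra_blocks *= 2;
		if (ra_blocks > RECOVERY_MAX_RA_BLOCKS)
			ra_blocks = RECOVERY_MAX_RA_BLOCKS;
	} else if (next_blkaddr % BLOCKS_PER_SEG) {
		/* non-consecutive jump not landing on a segment start: halve it */
		ra_blocks /= 2;
		if (ra_blocks < RECOVERY_MIN_RA_BLOCKS)
			ra_blocks = RECOVERY_MIN_RA_BLOCKS;
	}
	return ra_blocks;
}

int main(void)
{
	/* a node chain that is consecutive, then fragments, then recovers */
	unsigned int chain[] = { 1000, 1001, 1002, 1700, 1701, 1702 };
	unsigned int ra = RECOVERY_MAX_RA_BLOCKS;

	for (unsigned int i = 0; i + 1 < sizeof(chain) / sizeof(chain[0]); i++) {
		ra = adjust_ra_blocks(ra, chain[i], chain[i + 1]);
		printf("next_blkaddr=%u -> ra_blocks=%u\n", chain[i + 1], ra);
	}
	return 0;
}

With these rules, a fully consecutive log keeps the window at its maximum, while a fragmented log quickly drops back toward single-block reads instead of wasting readahead on blocks that will not be used.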
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++++++-- fs/f2fs/f2fs.h | 6 +++++- fs/f2fs/recovery.c | 27 ++++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c7a6d48e0da4..8d18a6129b2d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -282,18 +282,22 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, return blkno - start; } -void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks) { struct page *page; bool readahead = false; + if (ra_blocks == RECOVERY_MIN_RA_BLOCKS) + return; + page = find_get_page(META_MAPPING(sbi), index); if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); if (readahead) - f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); + f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2f2a3173f2d4..084531c4cf25 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -587,6 +587,9 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_PAGES +#define RECOVERY_MIN_RA_BLOCKS 1 + struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ union { @@ -3690,7 +3693,8 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); -void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks); long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type); void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d33e63805e11..0622953210d9 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -342,6 +342,19 @@ static int recover_inode(struct inode *inode, struct page *page) return 0; } +static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, + unsigned int ra_blocks, unsigned int blkaddr, + unsigned int next_blkaddr) +{ + if (blkaddr + 1 == next_blkaddr) + ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS, + ra_blocks * 2); + else if (next_blkaddr % sbi->blocks_per_seg) + ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS, + ra_blocks / 2); + return ra_blocks; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, bool check_only) { @@ -349,6 +362,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct page *page = NULL; block_t blkaddr; unsigned int loop_cnt = 0; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - valid_user_blocks(sbi); int err = 0; @@ -423,11 +437,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, break; } + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); - f2fs_ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } return err; } @@ -704,6 +721,7 @@ static 
int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, struct page *page = NULL; int err = 0; block_t blkaddr; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -715,8 +733,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) break; - f2fs_ra_meta_pages_cond(sbi, blkaddr); - page = f2fs_get_tmp_page(sbi, blkaddr); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -759,9 +775,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, if (entry->blkaddr == blkaddr) list_move_tail(&entry->list, tmp_inode_list); next: + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(page)); + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); + + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } if (!err) f2fs_allocate_new_segments(sbi); From f4ac10b500b89aa9b6644f8ef1dfb150a1bb962c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Feb 2022 15:19:46 +0800 Subject: [PATCH 372/580] f2fs: introduce F2FS_IPU_HONOR_OPU_WRITE ipu policy Once F2FS_IPU_FORCE policy is enabled in some cases: a) f2fs forces to use F2FS_IPU_FORCE in a small-sized volume b) user sets F2FS_IPU_FORCE policy via sysfs Then we may fail to defragment file due to IPU policy check, it doesn't make sense, let's introduce a new IPU policy to allow OPU during file defragmentation. In small-sized volume, let's enable F2FS_IPU_HONOR_OPU_WRITE policy by default. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 16 ++++++++++++---- fs/f2fs/data.c | 18 +++++++++++++----- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/file.c | 18 +++++++++++------- fs/f2fs/segment.h | 5 ++++- fs/f2fs/super.c | 3 ++- 6 files changed, 44 insertions(+), 19 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 362014fd8290..459690ced624 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -45,10 +45,18 @@ Date: November 2013 Contact: "Jaegeuk Kim" Description: Controls the in-place-update policy. updates in f2fs. User can set: - 0x01: F2FS_IPU_FORCE, 0x02: F2FS_IPU_SSR, - 0x04: F2FS_IPU_UTIL, 0x08: F2FS_IPU_SSR_UTIL, - 0x10: F2FS_IPU_FSYNC, 0x20: F2FS_IPU_ASYNC, - 0x40: F2FS_IPU_NOCACHE. + + ==== ================= + 0x01 F2FS_IPU_FORCE + 0x02 F2FS_IPU_SSR + 0x04 F2FS_IPU_UTIL + 0x08 F2FS_IPU_SSR_UTIL + 0x10 F2FS_IPU_FSYNC + 0x20 F2FS_IPU_ASYNC + 0x40 F2FS_IPU_NOCACHE + 0x80 F2FS_IPU_HONOR_OPU_WRITE + ==== ================= + Refer segment.h for details. 
What: /sys/fs/f2fs//min_ipu_util diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e86a72c0b06c..d299ca220bfd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2513,6 +2513,9 @@ static inline bool check_inplace_update_policy(struct inode *inode, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; + if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) && + is_inode_flag_set(inode, FI_OPU_WRITE)) + return false; if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) @@ -2583,6 +2586,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return true; + if (is_inode_flag_set(inode, FI_OPU_WRITE)) + return true; + if (fio) { if (page_private_gcing(fio->page)) return true; @@ -3209,8 +3215,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping, f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; - /* skip writing during file defragment */ - if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + /* skip writing in file defragment preparing stage */ + if (is_inode_flag_set(inode, FI_SKIP_WRITES)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, DATA); @@ -3932,6 +3938,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, f2fs_down_write(&F2FS_I(inode)->i_mmap_sem); set_inode_flag(inode, FI_ALIGNED_WRITE); + set_inode_flag(inode, FI_OPU_WRITE); for (; secidx < end_sec; secidx++) { f2fs_down_write(&sbi->pin_sem); @@ -3940,7 +3947,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); - set_inode_flag(inode, FI_DO_DEFRAG); + set_inode_flag(inode, FI_SKIP_WRITES); for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { struct page *page; @@ -3957,7 +3964,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, f2fs_put_page(page, 1); } - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); ret = filemap_fdatawrite(inode->i_mapping); @@ -3968,7 +3975,8 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, } done: - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); + clear_inode_flag(inode, FI_OPU_WRITE); clear_inode_flag(inode, FI_ALIGNED_WRITE); f2fs_up_write(&F2FS_I(inode)->i_mmap_sem); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 084531c4cf25..5217c9aa9348 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -737,7 +737,8 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ - FI_DO_DEFRAG, /* indicate defragment is running */ + FI_SKIP_WRITES, /* should skip data page writeback */ + FI_OPU_WRITE, /* used for opu per file */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ FI_HOT_DATA, /* indicate file is hot */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f9d241e5f387..c9153f4c467d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2638,10 +2638,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, bool fragmented = false; int err; - /* if in-place-update policy is enabled, don't waste time here */ - if (f2fs_should_update_inplace(inode, NULL)) - return -EINVAL; - pg_start = range->start >> PAGE_SHIFT; pg_end = (range->start + range->len) >> PAGE_SHIFT; @@ -2649,6 +2645,13 @@ static int 
f2fs_defragment_range(struct f2fs_sb_info *sbi, inode_lock(inode); + /* if in-place-update policy is enabled, don't waste time here */ + set_inode_flag(inode, FI_OPU_WRITE); + if (f2fs_should_update_inplace(inode, NULL)) { + err = -EINVAL; + goto out; + } + /* writeback all dirty pages in the range */ err = filemap_write_and_wait_range(inode->i_mapping, range->start, range->start + range->len - 1); @@ -2730,7 +2733,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto check; } - set_inode_flag(inode, FI_DO_DEFRAG); + set_inode_flag(inode, FI_SKIP_WRITES); idx = map.m_lblk; while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { @@ -2755,15 +2758,16 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, if (map.m_lblk < pg_end && cnt < blk_per_seg) goto do_map; - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); err = filemap_fdatawrite(inode->i_mapping); if (err) goto out; } clear_out: - clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_SKIP_WRITES); out: + clear_inode_flag(inode, FI_OPU_WRITE); inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_SHIFT; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3c01315931f6..aeb008f2377b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -651,7 +651,9 @@ static inline int utilization(struct f2fs_sb_info *sbi) * pages over min_fsync_blocks. (=default option) * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. * F2FS_IPU_NOCACHE - disable IPU bio cache. - * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode) + * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has + * FI_OPU_WRITE flag. + * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 @@ -667,6 +669,7 @@ enum { F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, F2FS_IPU_NOCACHE, + F2FS_IPU_HONOR_OPU_WRITE, }; static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 54c129050b60..8155b767c11c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3919,7 +3919,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; if (f2fs_block_unit_discard(sbi)) sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; + sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + 1 << F2FS_IPU_HONOR_OPU_WRITE; } sbi->readdir_ra = 1; From 89d0feecf5cf816038e49c40ce0193de6e8a4901 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Jan 2022 13:31:43 -0800 Subject: [PATCH 373/580] f2fs: add a way to limit roll forward recovery time This adds a sysfs entry to call checkpoint during fsync() in order to avoid long elapsed time to run roll-forward recovery when booting the device. Default value doesn't enforce the limitation which is same as before. 
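The mechanism is to count node blocks written with the fsync mark since the last checkpoint in a per-cpu counter (rf_node_block_count in the diff below) and, once that count crosses the new max_roll_forward_node_blocks tunable, force a checkpoint from the fsync path so the roll-forward log can never grow beyond the chosen bound. A hedged, stand-alone model of that bookkeeping follows; the helper names and the exact point where the checkpoint is triggered are illustrative, not the literal kernel code path.

#include <stdbool.h>
#include <stdio.h>

struct fs_state {
	unsigned long long rf_node_blocks;	/* node blocks written since last CP */
	unsigned long long max_rf_node_blocks;	/* sysfs tunable; 0 = no limit */
};

/* Called for every node block written with the fsync mark. */
static void account_fsync_node_block(struct fs_state *fs)
{
	fs->rf_node_blocks++;	/* percpu_counter_inc() in the kernel */
}

/* Decide whether fsync should fall back to a full checkpoint. */
static bool need_checkpoint_for_recovery_bound(const struct fs_state *fs)
{
	return fs->max_rf_node_blocks &&
	       fs->rf_node_blocks >= fs->max_rf_node_blocks;
}

/* The checkpoint resets the counter, bounding future roll-forward work. */
static void do_checkpoint(struct fs_state *fs)
{
	fs->rf_node_blocks = 0;	/* percpu_counter_set(..., 0) in the kernel */
}

int main(void)
{
	struct fs_state fs = { .rf_node_blocks = 0, .max_rf_node_blocks = 4 };

	for (int i = 0; i < 10; i++) {
		account_fsync_node_block(&fs);
		if (need_checkpoint_for_recovery_bound(&fs)) {
			printf("fsync #%d: limit hit, issuing checkpoint\n", i);
			do_checkpoint(&fs);
		}
	}
	return 0;
}

The default of zero keeps today's behaviour (no forced checkpoints); a device that must boot quickly can trade some fsync latency for a hard cap on the number of node blocks recovery has to replay.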
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/checkpoint.c | 1 + fs/f2fs/debug.c | 3 +++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/node.c | 2 ++ fs/f2fs/node.h | 3 +++ fs/f2fs/recovery.c | 4 ++++ fs/f2fs/super.c | 14 ++++++++++++-- fs/f2fs/sysfs.c | 2 ++ 9 files changed, 36 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 459690ced624..aa244a4d46b8 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -549,3 +549,9 @@ Contact: "Daeho Jeong" Description: You can set the trial count limit for GC urgent high mode with this value. If GC thread gets to the limit, the mode will turn back to GC normal mode. By default, the value is zero, which means there is no limit like before. + +What: /sys/fs/f2fs//max_roll_forward_node_blocks +Date: January 2022 +Contact: "Jaegeuk Kim" +Description: Controls max # of node block writes to be used for roll forward + recovery. This can limit the roll forward recovery time. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8d18a6129b2d..4479c6a1ccdf 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1547,6 +1547,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; percpu_counter_set(&sbi->alloc_valid_block_count, 0); + percpu_counter_set(&sbi->rf_node_block_count, 0); /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8c50518475a9..9a13902c7702 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -532,6 +532,9 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_meta, si->meta_pages); seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); + seq_printf(s, " - fsync mark: %4lld\n", + percpu_counter_sum_positive( + &si->sbi->rf_node_block_count)); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5217c9aa9348..ce93f7f69953 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -916,6 +916,7 @@ struct f2fs_nm_info { nid_t max_nid; /* maximum possible node ids */ nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + nid_t max_rf_node_blocks; /* max # of nodes for recovery */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ @@ -1692,6 +1693,8 @@ struct f2fs_sb_info { atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* # of node block writes as roll forward recovery */ + struct percpu_counter rf_node_block_count; /* writeback control */ atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 56d08c6caca0..66d4d17d0bc1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1782,6 +1782,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, if (!atomic || page == last_page) { set_fsync_mark(page, 1); + percpu_counter_inc(&sbi->rf_node_block_count); if (IS_INODE(page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) @@ 
-3218,6 +3219,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; + nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 18b98cf0465b..4c1d34bfea78 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -31,6 +31,9 @@ /* control total # of nats */ #define DEF_NAT_CACHE_THRESHOLD 100000 +/* control total # of node writes used for roll-fowrad recovery */ +#define DEF_RF_NODE_BLOCKS 0 + /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 #define SETVEC_SIZE 32 diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 0622953210d9..fc85d8f8ce65 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -55,6 +55,10 @@ bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; + if (NM_I(sbi)->max_rf_node_blocks && + percpu_counter_sum_positive(&sbi->rf_node_block_count) >= + NM_I(sbi)->max_rf_node_blocks) + return false; return true; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8155b767c11c..690d4d2be3e0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1504,8 +1504,9 @@ static void f2fs_destroy_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); + percpu_counter_destroy(&sbi->rf_node_block_count); + percpu_counter_destroy(&sbi->alloc_valid_block_count); } static void destroy_device_list(struct f2fs_sb_info *sbi) @@ -3595,11 +3596,20 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) if (err) return err; + err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL); + if (err) + goto err_valid_block; + err = percpu_counter_init(&sbi->total_valid_inode_count, 0, GFP_KERNEL); if (err) - percpu_counter_destroy(&sbi->alloc_valid_block_count); + goto err_node_block; + return 0; +err_node_block: + percpu_counter_destroy(&sbi->rf_node_block_count); +err_valid_block: + percpu_counter_destroy(&sbi->alloc_valid_block_count); return err; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 1b13d741fe02..dced8a2dc034 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -728,6 +728,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); @@ -850,6 +851,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(max_roll_forward_node_blocks), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), From f5b24a11cb236fa96125488397274e713768e1d1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Feb 2022 18:56:46 -0800 Subject: [PATCH 374/580] f2fs: fix missing free nid in f2fs_handle_failed_inode This patch fixes 
xfstests/generic/475 failure. [ 293.680694] F2FS-fs (dm-1): May loss orphan inode, run fsck to fix. [ 293.685358] Buffer I/O error on dev dm-1, logical block 8388592, async page read [ 293.691527] Buffer I/O error on dev dm-1, logical block 8388592, async page read [ 293.691764] sh (7615): drop_caches: 3 [ 293.691819] sh (7616): drop_caches: 3 [ 293.694017] Buffer I/O error on dev dm-1, logical block 1, async page read [ 293.695659] sh (7618): drop_caches: 3 [ 293.696979] sh (7617): drop_caches: 3 [ 293.700290] sh (7623): drop_caches: 3 [ 293.708621] sh (7626): drop_caches: 3 [ 293.711386] sh (7628): drop_caches: 3 [ 293.711825] sh (7627): drop_caches: 3 [ 293.716738] sh (7630): drop_caches: 3 [ 293.719613] sh (7632): drop_caches: 3 [ 293.720971] sh (7633): drop_caches: 3 [ 293.727741] sh (7634): drop_caches: 3 [ 293.730783] sh (7636): drop_caches: 3 [ 293.732681] sh (7635): drop_caches: 3 [ 293.732988] sh (7637): drop_caches: 3 [ 293.738836] sh (7639): drop_caches: 3 [ 293.740568] sh (7641): drop_caches: 3 [ 293.743053] sh (7640): drop_caches: 3 [ 293.821889] ------------[ cut here ]------------ [ 293.824654] kernel BUG at fs/f2fs/node.c:3334! [ 293.826226] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 293.828713] CPU: 0 PID: 7653 Comm: umount Tainted: G OE 5.17.0-rc1-custom #1 [ 293.830946] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 293.832526] RIP: 0010:f2fs_destroy_node_manager+0x33f/0x350 [f2fs] [ 293.833905] Code: e8 d6 3d f9 f9 48 8b 45 d0 65 48 2b 04 25 28 00 00 00 75 1a 48 81 c4 28 03 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 0b [ 293.837783] RSP: 0018:ffffb04ec31e7a20 EFLAGS: 00010202 [ 293.839062] RAX: 0000000000000001 RBX: ffff9df947db2eb8 RCX: 0000000080aa0072 [ 293.840666] RDX: 0000000000000000 RSI: ffffe86c0432a140 RDI: ffffffffc0b72a21 [ 293.842261] RBP: ffffb04ec31e7d70 R08: ffff9df94ca85780 R09: 0000000080aa0072 [ 293.843909] R10: ffff9df94ca85700 R11: ffff9df94e1ccf58 R12: ffff9df947db2e00 [ 293.845594] R13: ffff9df947db2ed0 R14: ffff9df947db2eb8 R15: ffff9df947db2eb8 [ 293.847855] FS: 00007f5a97379800(0000) GS:ffff9dfa77c00000(0000) knlGS:0000000000000000 [ 293.850647] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 293.852940] CR2: 00007f5a97528730 CR3: 000000010bc76005 CR4: 0000000000370ef0 [ 293.854680] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 293.856423] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 293.858380] Call Trace: [ 293.859302] [ 293.860311] ? ttwu_do_wakeup+0x1c/0x170 [ 293.861800] ? ttwu_do_activate+0x6d/0xb0 [ 293.863057] ? _raw_spin_unlock_irqrestore+0x29/0x40 [ 293.864411] ? try_to_wake_up+0x9d/0x5e0 [ 293.865618] ? debug_smp_processor_id+0x17/0x20 [ 293.866934] ? debug_smp_processor_id+0x17/0x20 [ 293.868223] ? free_unref_page+0xbf/0x120 [ 293.869470] ? __free_slab+0xcb/0x1c0 [ 293.870614] ? preempt_count_add+0x7a/0xc0 [ 293.871811] ? __slab_free+0xa0/0x2d0 [ 293.872918] ? __wake_up_common_lock+0x8a/0xc0 [ 293.874186] ? __slab_free+0xa0/0x2d0 [ 293.875305] ? free_inode_nonrcu+0x20/0x20 [ 293.876466] ? free_inode_nonrcu+0x20/0x20 [ 293.877650] ? debug_smp_processor_id+0x17/0x20 [ 293.878949] ? call_rcu+0x11a/0x240 [ 293.880060] ? f2fs_destroy_stats+0x59/0x60 [f2fs] [ 293.881437] ? 
kfree+0x1fe/0x230 [ 293.882674] f2fs_put_super+0x160/0x390 [f2fs] [ 293.883978] generic_shutdown_super+0x7a/0x120 [ 293.885274] kill_block_super+0x27/0x50 [ 293.886496] kill_f2fs_super+0x7f/0x100 [f2fs] [ 293.887806] deactivate_locked_super+0x35/0xa0 [ 293.889271] deactivate_super+0x40/0x50 [ 293.890513] cleanup_mnt+0x139/0x190 [ 293.891689] __cleanup_mnt+0x12/0x20 [ 293.892850] task_work_run+0x64/0xa0 [ 293.894035] exit_to_user_mode_prepare+0x1b7/0x1c0 [ 293.895409] syscall_exit_to_user_mode+0x27/0x50 [ 293.896872] do_syscall_64+0x48/0xc0 [ 293.898090] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 293.899517] RIP: 0033:0x7f5a975cd25b Fixes: 7735730d39d7 ("f2fs: fix to propagate error from __get_meta_page()") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index edfca0836f26..8f46a940871b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -885,6 +885,7 @@ void f2fs_handle_failed_inode(struct inode *inode) err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); + set_inode_flag(inode, FI_FREE_NID); f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); goto out; } From cd166a37c05d740c69bd0340e37882d4ef4655d1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Feb 2022 10:43:13 -0800 Subject: [PATCH 375/580] f2fs: Restore rwsem lockdep support Lockdep uses lock class keys in its analysis. init_rwsem() instantiates one lock class key with each init_rwsem() user as follows: #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) Commit e4544b63a7ee ("f2fs: move f2fs to use reader-unfair rwsems") reduced the number of lock class keys from one per init_rwsem() user to one per file in which init_f2fs_rwsem() is used. This causes the same lock class key to be associated with multiple f2fs rwsems and also triggers a number of false positive lockdep deadlock reports. Fix this by again instantiating one lock class key with each init_f2fs_rwsem() caller. Cc: Tim Murray Reported-by: syzbot+0b9cadf5fc45a98a5083@syzkaller.appspotmail.com Fixes: e4544b63a7ee ("f2fs: move f2fs to use reader-unfair rwsems") Signed-off-by: Bart Van Assche Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ce93f7f69953..4e9a805c1680 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2125,9 +2125,17 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } -static inline void init_f2fs_rwsem(struct f2fs_rwsem *sem) +#define init_f2fs_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_f2fs_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, + const char *sem_name, struct lock_class_key *key) { - init_rwsem(&sem->internal_rwsem); + __init_rwsem(&sem->internal_rwsem, sem_name, key); init_waitqueue_head(&sem->read_waiters); } From d5f1cc2de68339dde866a668f3814dfdcbf65d6e Mon Sep 17 00:00:00 2001 From: Juhyung Park Date: Tue, 15 Feb 2022 17:27:21 +0900 Subject: [PATCH 376/580] f2fs: quota: fix loop condition at f2fs_quota_sync() cnt should be passed to sb_has_quota_active() instead of type to check active quota properly. 
Moreover, when the type is -1, the compiler with enough inline knowledge can discard sb_has_quota_active() check altogether, causing a NULL pointer dereference at the following inode_lock(dqopt->files[cnt]): [ 2.796010] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000a0 [ 2.796024] Mem abort info: [ 2.796025] ESR = 0x96000005 [ 2.796028] EC = 0x25: DABT (current EL), IL = 32 bits [ 2.796029] SET = 0, FnV = 0 [ 2.796031] EA = 0, S1PTW = 0 [ 2.796032] Data abort info: [ 2.796034] ISV = 0, ISS = 0x00000005 [ 2.796035] CM = 0, WnR = 0 [ 2.796046] user pgtable: 4k pages, 39-bit VAs, pgdp=00000003370d1000 [ 2.796048] [00000000000000a0] pgd=0000000000000000, pud=0000000000000000 [ 2.796051] Internal error: Oops: 96000005 [#1] PREEMPT SMP [ 2.796056] CPU: 7 PID: 640 Comm: f2fs_ckpt-259:7 Tainted: G S 5.4.179-arter97-r8-64666-g2f16e087f9d8 #1 [ 2.796057] Hardware name: Qualcomm Technologies, Inc. Lahaina MTP lemonadep (DT) [ 2.796059] pstate: 80c00005 (Nzcv daif +PAN +UAO) [ 2.796065] pc : down_write+0x28/0x70 [ 2.796070] lr : f2fs_quota_sync+0x100/0x294 [ 2.796071] sp : ffffffa3f48ffc30 [ 2.796073] x29: ffffffa3f48ffc30 x28: 0000000000000000 [ 2.796075] x27: ffffffa3f6d718b8 x26: ffffffa415fe9d80 [ 2.796077] x25: ffffffa3f7290048 x24: 0000000000000001 [ 2.796078] x23: 0000000000000000 x22: ffffffa3f7290000 [ 2.796080] x21: ffffffa3f72904a0 x20: ffffffa3f7290110 [ 2.796081] x19: ffffffa3f77a9800 x18: ffffffc020aae038 [ 2.796083] x17: ffffffa40e38e040 x16: ffffffa40e38e6d0 [ 2.796085] x15: ffffffa40e38e6cc x14: ffffffa40e38e6d0 [ 2.796086] x13: 00000000000004f6 x12: 00162c44ff493000 [ 2.796088] x11: 0000000000000400 x10: ffffffa40e38c948 [ 2.796090] x9 : 0000000000000000 x8 : 00000000000000a0 [ 2.796091] x7 : 0000000000000000 x6 : 0000d1060f00002a [ 2.796093] x5 : ffffffa3f48ff718 x4 : 000000000000000d [ 2.796094] x3 : 00000000060c0000 x2 : 0000000000000001 [ 2.796096] x1 : 0000000000000000 x0 : 00000000000000a0 [ 2.796098] Call trace: [ 2.796100] down_write+0x28/0x70 [ 2.796102] f2fs_quota_sync+0x100/0x294 [ 2.796104] block_operations+0x120/0x204 [ 2.796106] f2fs_write_checkpoint+0x11c/0x520 [ 2.796107] __checkpoint_and_complete_reqs+0x7c/0xd34 [ 2.796109] issue_checkpoint_thread+0x6c/0xb8 [ 2.796112] kthread+0x138/0x414 [ 2.796114] ret_from_fork+0x10/0x18 [ 2.796117] Code: aa0803e0 aa1f03e1 52800022 aa0103e9 (c8e97d02) [ 2.796120] ---[ end trace 96e942e8eb6a0b53 ]--- [ 2.800116] Kernel panic - not syncing: Fatal exception [ 2.800120] SMP: stopping secondary CPUs Fixes: 9de71ede81e6 ("f2fs: quota: fix potential deadlock") Cc: # v5.15+ Signed-off-by: Juhyung Park Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 690d4d2be3e0..2798587f9780 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2685,7 +2685,7 @@ int f2fs_quota_sync(struct super_block *sb, int type) struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); int cnt; - int ret; + int ret = 0; /* * Now when everything is written we can discard the pagecache so @@ -2696,8 +2696,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (type != -1 && cnt != type) continue; - if (!sb_has_quota_active(sb, type)) - return 0; + if (!sb_has_quota_active(sb, cnt)) + continue; inode_lock(dqopt->files[cnt]); From 95cdf71a22a90da36fa97f2b02d2cb90c8e09509 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Jan 2022 13:44:49 +0800 Subject: [PATCH 377/580] f2fs: 
fix to avoid potential deadlock Quoted from Jing Xia's report, there is a potential deadlock may happen between kworker and checkpoint as below: [T:writeback] [T:checkpoint] - wb_writeback - blk_start_plug bio contains NodeA was plugged in writeback threads - do_writepages -- sync write inodeB, inc wb_sync_req[DATA] - f2fs_write_data_pages - f2fs_write_single_data_page -- write last dirty page - f2fs_do_write_data_page - set_page_writeback -- clear page dirty flag and PAGECACHE_TAG_DIRTY tag in radix tree - f2fs_outplace_write_data - f2fs_update_data_blkaddr - f2fs_wait_on_page_writeback -- wait NodeA to writeback here - inode_dec_dirty_pages - writeback_sb_inodes - writeback_single_inode - do_writepages - f2fs_write_data_pages -- skip writepages due to wb_sync_req[DATA] - wbc->pages_skipped += get_dirty_pages() -- PAGECACHE_TAG_DIRTY is not set but get_dirty_pages() returns one - requeue_inode -- requeue inode to wb->b_dirty queue due to non-zero.pages_skipped - blk_finish_plug Let's try to avoid deadlock condition by forcing unplugging previous bio via blk_finish_plug(current->plug) once we'v skipped writeback in writepages() due to valid sbi->wb_sync_req[DATA/NODE]. Fixes: 687de7f1010c ("f2fs: avoid IO split due to mixed WB_SYNC_ALL and WB_SYNC_NONE") Signed-off-by: Zhiguo Niu Signed-off-by: Jing Xia Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++++- fs/f2fs/node.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d299ca220bfd..c8dfee84f598 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3224,8 +3224,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping, /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[DATA]); - else if (atomic_read(&sbi->wb_sync_req[DATA])) + else if (atomic_read(&sbi->wb_sync_req[DATA])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } if (__should_serialize_io(inode, wbc)) { mutex_lock(&sbi->writepages); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 66d4d17d0bc1..b6d250f954ca 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2112,8 +2112,12 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[NODE]); - else if (atomic_read(&sbi->wb_sync_req[NODE])) + else if (atomic_read(&sbi->wb_sync_req[NODE])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); goto skip_write; + } trace_f2fs_writepages(mapping->host, wbc, NODE); From 9a1767d889d1d08daeacd4ad73bb9c483b046a30 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Mar 2022 09:49:13 +0800 Subject: [PATCH 378/580] f2fs: fix to do sanity check on curseg->alloc_type As Wenqing Liu reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215657 - Overview UBSAN: array-index-out-of-bounds in fs/f2fs/segment.c:3460:2 when mount and operate a corrupted image - Reproduce tested on kernel 5.17-rc4, 5.17-rc6 1. mkdir test_crash 2. cd test_crash 3. unzip tmp2.zip 4. mkdir mnt 5. 
./single_test.sh f2fs 2 - Kernel dump [ 46.434454] loop0: detected capacity change from 0 to 131072 [ 46.529839] F2FS-fs (loop0): Mounted with checkpoint version = 7548c2d9 [ 46.738319] ================================================================================ [ 46.738412] UBSAN: array-index-out-of-bounds in fs/f2fs/segment.c:3460:2 [ 46.738475] index 231 is out of range for type 'unsigned int [2]' [ 46.738539] CPU: 2 PID: 939 Comm: umount Not tainted 5.17.0-rc6 #1 [ 46.738547] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 46.738551] Call Trace: [ 46.738556] [ 46.738563] dump_stack_lvl+0x47/0x5c [ 46.738581] ubsan_epilogue+0x5/0x50 [ 46.738592] __ubsan_handle_out_of_bounds+0x68/0x80 [ 46.738604] f2fs_allocate_data_block+0xdff/0xe60 [f2fs] [ 46.738819] do_write_page+0xef/0x210 [f2fs] [ 46.738934] f2fs_do_write_node_page+0x3f/0x80 [f2fs] [ 46.739038] __write_node_page+0x2b7/0x920 [f2fs] [ 46.739162] f2fs_sync_node_pages+0x943/0xb00 [f2fs] [ 46.739293] f2fs_write_checkpoint+0x7bb/0x1030 [f2fs] [ 46.739405] kill_f2fs_super+0x125/0x150 [f2fs] [ 46.739507] deactivate_locked_super+0x60/0xc0 [ 46.739517] deactivate_super+0x70/0xb0 [ 46.739524] cleanup_mnt+0x11a/0x200 [ 46.739532] __cleanup_mnt+0x16/0x20 [ 46.739538] task_work_run+0x67/0xa0 [ 46.739547] exit_to_user_mode_prepare+0x18c/0x1a0 [ 46.739559] syscall_exit_to_user_mode+0x26/0x40 [ 46.739568] do_syscall_64+0x46/0xb0 [ 46.739584] entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is we missed to do sanity check on curseg->alloc_type, result in out-of-bound accessing on sbi->block_count[] array, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b3847165c65e..95041d42d30b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4795,6 +4795,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) sanity_check_seg_type(sbi, curseg->seg_type); + if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) { + f2fs_err(sbi, + "Current segment has invalid alloc_type:%d", + curseg->alloc_type); + return -EFSCORRUPTED; + } + if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; From 88037f0b0bb6274388c1e0646f9b0b53f13a0e42 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 2 Mar 2022 11:39:08 -0800 Subject: [PATCH 379/580] f2fs: avoid an infinite loop in f2fs_sync_dirty_inodes If one read IO is always failing, we can fall into an infinite loop in f2fs_sync_dirty_inodes. This happens during xfstests/generic/475. [ 142.803335] Buffer I/O error on dev dm-1, logical block 8388592, async page read ... [ 382.887210] submit_bio_noacct+0xdd/0x2a0 [ 382.887213] submit_bio+0x80/0x110 [ 382.887223] __submit_bio+0x4d/0x300 [f2fs] [ 382.887282] f2fs_submit_page_bio+0x125/0x200 [f2fs] [ 382.887299] __get_meta_page+0xc9/0x280 [f2fs] [ 382.887315] f2fs_get_meta_page+0x13/0x20 [f2fs] [ 382.887331] f2fs_get_node_info+0x317/0x3c0 [f2fs] [ 382.887350] f2fs_do_write_data_page+0x327/0x6f0 [f2fs] [ 382.887367] f2fs_write_single_data_page+0x5b7/0x960 [f2fs] [ 382.887386] f2fs_write_cache_pages+0x302/0x890 [f2fs] [ 382.887405] ? preempt_count_add+0x7a/0xc0 [ 382.887408] f2fs_write_data_pages+0xfd/0x320 [f2fs] [ 382.887425] ? 
_raw_spin_unlock+0x1a/0x30 [ 382.887428] do_writepages+0xd3/0x1d0 [ 382.887432] filemap_fdatawrite_wbc+0x69/0x90 [ 382.887434] filemap_fdatawrite+0x50/0x70 [ 382.887437] f2fs_sync_dirty_inodes+0xa4/0x270 [f2fs] [ 382.887453] f2fs_write_checkpoint+0x189/0x1640 [f2fs] [ 382.887469] ? schedule_timeout+0x114/0x150 [ 382.887471] ? ttwu_do_activate+0x6d/0xb0 [ 382.887473] ? preempt_count_add+0x7a/0xc0 [ 382.887476] kill_f2fs_super+0xca/0x100 [f2fs] [ 382.887491] deactivate_locked_super+0x35/0xa0 [ 382.887494] deactivate_super+0x40/0x50 [ 382.887497] cleanup_mnt+0x139/0x190 [ 382.887499] __cleanup_mnt+0x12/0x20 [ 382.887501] task_work_run+0x64/0xa0 [ 382.887505] exit_to_user_mode_prepare+0x1b7/0x1c0 [ 382.887508] syscall_exit_to_user_mode+0x27/0x50 [ 382.887510] do_syscall_64+0x48/0xc0 [ 382.887513] entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +++++++ fs/f2fs/f2fs.h | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4479c6a1ccdf..dea5b5c57bd8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,6 +98,13 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, } if (unlikely(!PageUptodate(page))) { + if (page->index == sbi->metapage_eio_ofs && + sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) { + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->metapage_eio_ofs = page->index; + sbi->metapage_eio_cnt = 0; + } f2fs_put_page(page, 1); return ERR_PTR(-EIO); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4e9a805c1680..74ce75f79e54 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -574,6 +574,9 @@ enum { /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 +/* maximum retry of EIO'ed meta page */ +#define MAX_RETRY_META_PAGE_EIO 100 + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ @@ -1618,6 +1621,8 @@ struct f2fs_sb_info { /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ + pgoff_t metapage_eio_ofs; /* EIO page offset */ + int metapage_eio_cnt; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ From 0183719cbaec1a88526c12af172110b2b582215d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Mar 2022 13:21:27 -0800 Subject: [PATCH 380/580] f2fs: introduce F2FS_UNFAIR_RWSEM to support unfair rwsem Unfair rwsem should be used when blk-cg is on. Otherwise, there is regression. FYI, we noticed a -26.7% regression of aim7.jobs-per-min due to commit: commit: e4544b63a7ee49e7fbebf35ece0a6acd3b9617ae ("f2fs: move f2fs to use reader-unfair rwsems") https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master in testcase: aim7 on test machine: 88 threads 2 sockets Intel(R) Xeon(R) Gold 6238M CPU @ 2.10GHz with 128G memory with following parameters: disk: 4BRD_12G md: RAID0 fs: f2fs test: sync_disk_rw load: 100 cpufreq_governor: performance ucode: 0x500320a test-description: AIM7 is a traditional UNIX system level benchmark suite which is used to test and measure the performance of multiuser system. 
test-url: https://sourceforge.net/projects/aimbench/files/aim-suite7/ Reported-by: kernel test robot Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 7 +++++++ fs/f2fs/f2fs.h | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 0da1790e036a..404ac7af2ca7 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -150,3 +150,10 @@ config F2FS_IOSTAT Support getting IO statistics through sysfs and printing out periodic IO statistics tracepoint events. You have to turn on "iostat_enable" sysfs node to enable this feature. + +config F2FS_UNFAIR_RWSEM + bool "F2FS unfair rw_semaphore" + depends on F2FS_FS && BLK_CGROUP + help + Use unfair rw_semaphore, if system configured IO priority by block + cgroup. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 74ce75f79e54..a7509040d8fe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -129,7 +129,9 @@ typedef u32 nid_t; struct f2fs_rwsem { struct rw_semaphore internal_rwsem; +#ifdef CONFIG_F2FS_UNFAIR_RWSEM wait_queue_head_t read_waiters; +#endif }; struct f2fs_mount_info { @@ -2141,7 +2143,9 @@ static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, const char *sem_name, struct lock_class_key *key) { __init_rwsem(&sem->internal_rwsem, sem_name, key); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM init_waitqueue_head(&sem->read_waiters); +#endif } static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem) @@ -2156,7 +2160,11 @@ static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem) static inline void f2fs_down_read(struct f2fs_rwsem *sem) { +#ifdef CONFIG_F2FS_UNFAIR_RWSEM wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem)); +#else + down_read(&sem->internal_rwsem); +#endif } static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) @@ -2191,7 +2199,9 @@ static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) static inline void f2fs_up_write(struct f2fs_rwsem *sem) { up_write(&sem->internal_rwsem); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM wake_up_all(&sem->read_waiters); +#endif } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) From 69f0e3ea0c61ac53a1ad51276cedf3a4812d9976 Mon Sep 17 00:00:00 2001 From: Jia Yang Date: Wed, 9 Mar 2022 19:09:35 +0800 Subject: [PATCH 381/580] f2fs: remove unnecessary read for F2FS_FITS_IN_INODE F2FS_FITS_IN_INODE only cares the type of f2fs inode, so there is no need to read node page of f2fs inode. 
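Editor's note: the NULL ri in both call sites below is safe because F2FS_FITS_IN_INODE() only needs the layout of struct f2fs_inode -- its argument is consumed by compile-time constructs such as offsetof()/sizeof() and is never dereferenced (the macro body itself lives in f2fs.h and is not shown in this diff). A stand-alone illustration of that C property, using a hypothetical struct:

	#include <stddef.h>
	#include <stdio.h>

	struct demo_inode { int i_mode; int i_projid; };

	int main(void)
	{
		struct demo_inode *ri = NULL;	/* mirrors the NULL ri below */

		/* sizeof()/offsetof() never evaluate ri, so this cannot fault */
		printf("off=%zu size=%zu\n",
		       offsetof(struct demo_inode, i_projid), sizeof(ri->i_projid));
		return 0;
	}

That is why dropping the f2fs_get_node_page()/f2fs_put_page() round trip removes a real node page read without changing the result of the check.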
Signed-off-by: Jia Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c9153f4c467d..4145466cf3d9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -831,7 +831,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = d_inode(path->dentry); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_inode *ri; + struct f2fs_inode *ri = NULL; unsigned int flags; if (f2fs_has_extra_attr(inode) && @@ -3074,7 +3074,7 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *ipage; + struct f2fs_inode *ri = NULL; kprojid_t kprojid; int err; @@ -3098,17 +3098,8 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) if (IS_NOQUOTA(inode)) return err; - ipage = f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); - - if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, - i_projid)) { - err = -EOVERFLOW; - f2fs_put_page(ipage, 1); - return err; - } - f2fs_put_page(ipage, 1); + if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + return -EOVERFLOW; err = f2fs_dquot_initialize(inode); if (err) From f8282fa1b262544717b75d0d09f4a1c89fe2d904 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Mar 2022 09:40:05 -0800 Subject: [PATCH 382/580] f2fs: don't get FREEZE lock in f2fs_evict_inode in frozen fs Let's purge inode cache in order to avoid the below deadlock. [freeze test] shrinkder freeze_super - pwercpu_down_write(SB_FREEZE_FS) - super_cache_scan - down_read(&sb->s_umount) - prune_icache_sb - dispose_list - evict - f2fs_evict_inode thaw_super - down_write(&sb->s_umount); - __percpu_down_read(SB_FREEZE_FS) Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 1 + fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 6 ++++-- fs/f2fs/super.c | 4 ++++ 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index aa244a4d46b8..5189dfebbadf 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -445,6 +445,7 @@ Description: Show status of f2fs superblock in real time. 
0x800 SBI_QUOTA_SKIP_FLUSH skip flushing quota in current CP 0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted 0x2000 SBI_IS_RESIZEFS resizefs is in process + 0x4000 SBI_IS_FREEZING freefs is in process ====== ===================== ================================= What: /sys/fs/f2fs//ckpt_thread_ioprio diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 9a13902c7702..cba5eab24595 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -338,6 +338,7 @@ static char *s_flag[] = { [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", [SBI_IS_RESIZEFS] = " resizefs", + [SBI_IS_FREEZING] = " freezefs", }; static int stat_show(struct seq_file *s, void *v) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a7509040d8fe..68f9fa5571aa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1291,6 +1291,7 @@ enum { SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ SBI_IS_RESIZEFS, /* resizefs is in process */ + SBI_IS_FREEZING, /* freezefs is in process */ }; enum { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 8f46a940871b..7b7758736859 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -778,7 +778,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); - sb_start_intwrite(inode->i_sb); + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); retry: @@ -809,7 +810,8 @@ void f2fs_evict_inode(struct inode *inode) if (dquot_initialize_needed(inode)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); } - sb_end_intwrite(inode->i_sb); + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) + sb_end_intwrite(inode->i_sb); no_delete: dquot_drop(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2798587f9780..6537ef36498e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1665,11 +1665,15 @@ static int f2fs_freeze(struct super_block *sb) /* ensure no checkpoint required */ if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) return -EINVAL; + + /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ + set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } static int f2fs_unfreeze(struct super_block *sb) { + clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } From 9dc7944f728a4ed84bdf1893d60e8d61a2fe378a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 7 Mar 2022 17:27:46 -0800 Subject: [PATCH 383/580] f2fs: use spin_lock to avoid hang [14696.634553] task:cat state:D stack: 0 pid:1613738 ppid:1613735 flags:0x00000004 [14696.638285] Call Trace: [14696.639038] [14696.640032] __schedule+0x302/0x930 [14696.640969] schedule+0x58/0xd0 [14696.641799] schedule_preempt_disabled+0x18/0x30 [14696.642890] __mutex_lock.constprop.0+0x2fb/0x4f0 [14696.644035] ? mod_objcg_state+0x10c/0x310 [14696.645040] ? obj_cgroup_charge+0xe1/0x170 [14696.646067] __mutex_lock_slowpath+0x13/0x20 [14696.647126] mutex_lock+0x34/0x40 [14696.648070] stat_show+0x25/0x17c0 [f2fs] [14696.649218] seq_read_iter+0x120/0x4b0 [14696.650289] ? aa_file_perm+0x12a/0x500 [14696.651357] ? 
lru_cache_add+0x1c/0x20 [14696.652470] seq_read+0xfd/0x140 [14696.653445] full_proxy_read+0x5c/0x80 [14696.654535] vfs_read+0xa0/0x1a0 [14696.655497] ksys_read+0x67/0xe0 [14696.656502] __x64_sys_read+0x1a/0x20 [14696.657580] do_syscall_64+0x3b/0xc0 [14696.658671] entry_SYSCALL_64_after_hwframe+0x44/0xae [14696.660068] RIP: 0033:0x7efe39df1cb2 [14696.661133] RSP: 002b:00007ffc8badd948 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [14696.662958] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007efe39df1cb2 [14696.664757] RDX: 0000000000020000 RSI: 00007efe399df000 RDI: 0000000000000003 [14696.666542] RBP: 00007efe399df000 R08: 00007efe399de010 R09: 00007efe399de010 [14696.668363] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000000000 [14696.670155] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [14696.671965] [14696.672826] task:umount state:D stack: 0 pid:1614985 ppid:1614984 flags:0x00004000 [14696.674930] Call Trace: [14696.675903] [14696.676780] __schedule+0x302/0x930 [14696.677927] schedule+0x58/0xd0 [14696.679019] schedule_preempt_disabled+0x18/0x30 [14696.680412] __mutex_lock.constprop.0+0x2fb/0x4f0 [14696.681783] ? destroy_inode+0x65/0x80 [14696.683006] __mutex_lock_slowpath+0x13/0x20 [14696.684305] mutex_lock+0x34/0x40 [14696.685442] f2fs_destroy_stats+0x1e/0x60 [f2fs] [14696.686803] f2fs_put_super+0x158/0x390 [f2fs] [14696.688238] generic_shutdown_super+0x7a/0x120 [14696.689621] kill_block_super+0x27/0x50 [14696.690894] kill_f2fs_super+0x7f/0x100 [f2fs] [14696.692311] deactivate_locked_super+0x35/0xa0 [14696.693698] deactivate_super+0x40/0x50 [14696.694985] cleanup_mnt+0x139/0x190 [14696.696209] __cleanup_mnt+0x12/0x20 [14696.697390] task_work_run+0x64/0xa0 [14696.698587] exit_to_user_mode_prepare+0x1b7/0x1c0 [14696.700053] syscall_exit_to_user_mode+0x27/0x50 [14696.701418] do_syscall_64+0x48/0xc0 [14696.702630] entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index cba5eab24595..6d26872c7364 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,7 +21,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static DEFINE_MUTEX(f2fs_stat_mutex); +static DEFINE_RAW_SPINLOCK(f2fs_stat_lock); #ifdef CONFIG_DEBUG_FS static struct dentry *f2fs_debugfs_root; #endif @@ -345,8 +345,9 @@ static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; int i = 0, j = 0; + unsigned long flags; - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); @@ -577,7 +578,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); return 0; } @@ -588,6 +589,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + unsigned long flags; int i; si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); @@ -623,9 +625,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->max_aw_cnt, 0); atomic_set(&sbi->max_vw_cnt, 0); - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); return 0; } @@ 
-633,10 +635,11 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long flags; - mutex_lock(&f2fs_stat_mutex); + raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_del(&si->stat_list); - mutex_unlock(&f2fs_stat_mutex); + raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); kfree(si); } From 1a560ac44e1f1611ec8eb51ffc28f5171438ccfb Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Mon, 14 Mar 2022 15:15:15 +0800 Subject: [PATCH 384/580] f2fs: remove redundant parameter judgment iput() has already judged the incoming parameter, so there is no need to repeat the judgment here. Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b025055df431..a0604e949adc 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1101,8 +1101,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, out_old: f2fs_put_page(old_page, 0); out: - if (whiteout) - iput(whiteout); + iput(whiteout); return err; } From c2b51637665656ea41142130a33e65becb6aba86 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Mar 2022 18:20:00 +0800 Subject: [PATCH 385/580] f2fs: compress: fix to print raw data size in error path of lz4 decompression In lz4_decompress_pages(), if size of decompressed data is not equal to expected one, we should print the size rather than size of target buffer for decompressed data, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index e13f3cb1eef2..4d8d51502ebb 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -314,10 +314,9 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) } if (ret != PAGE_SIZE << dic->log_cluster_size) { - printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, " + printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, " "expected:%lu\n", KERN_ERR, - F2FS_I_SB(dic->inode)->sb->s_id, - dic->rlen, + F2FS_I_SB(dic->inode)->sb->s_id, ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; } From f6011d156126b918a002befe4fa5a87d0db217d8 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 15 Mar 2022 21:14:14 -0700 Subject: [PATCH 386/580] f2fs: introduce gc_urgent_mid mode We need a mid level of gc urgent mode to do GC forcibly in a period of given gc_urgent_sleep_time, but not like using greedy GC approach and switching to SSR mode such as gc urgent high mode. This can be used for more aggressive periodic storage clean up. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 17 +++++++++++------ fs/f2fs/debug.c | 4 +++- fs/f2fs/f2fs.h | 4 ++++ fs/f2fs/gc.c | 3 +++ fs/f2fs/sysfs.c | 7 +++++++ 5 files changed, 28 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 5189dfebbadf..582b71796474 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -294,11 +294,16 @@ Description: Shows current reserved blocks in system, it may be temporarily What: /sys/fs/f2fs//gc_urgent Date: August 2017 Contact: "Jaegeuk Kim" -Description: Do background GC agressively when set. When gc_urgent = 1, - background thread starts to do GC by given gc_urgent_sleep_time - interval. 
When gc_urgent = 2, F2FS will lower the bar of - checking idle in order to process outstanding discard commands - and GC a little bit aggressively. It is set to 0 by default. +Description: Do background GC aggressively when set. Set to 0 by default. + gc urgent high(1): does GC forcibly in a period of given + gc_urgent_sleep_time and ignores I/O idling check. uses greedy + GC approach and turns SSR mode on. + gc urgent low(2): lowers the bar of checking I/O idling in + order to process outstanding discard commands and GC a + little bit aggressively. uses cost benefit GC approach. + gc urgent mid(3): does GC forcibly in a period of given + gc_urgent_sleep_time and executes a mid level of I/O idling check. + uses cost benefit GC approach. What: /sys/fs/f2fs//gc_urgent_sleep_time Date: August 2017 @@ -519,7 +524,7 @@ Date: July 2021 Contact: "Daeho Jeong" Description: Show how many segments have been reclaimed by GC during a specific GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy, - 3: GC idle AT, 4: GC urgent high, 5: GC urgent low) + 3: GC idle AT, 4: GC urgent high, 5: GC urgent low 6: GC urgent mid) You can re-initialize this value to "0". What: /sys/fs/f2fs//gc_segment_mode diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6d26872c7364..fcdf253cd211 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -476,12 +476,14 @@ static int stat_show(struct seq_file *s, void *v) si->node_segs, si->bg_node_segs); seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " "Idle Greedy (%d), Idle AT (%d), " - "Urgent High (%d), Urgent Low (%d)\n", + "Urgent High (%d), Urgent Mid (%d), " + "Urgent Low (%d)\n", si->sbi->gc_reclaimed_segs[GC_NORMAL], si->sbi->gc_reclaimed_segs[GC_IDLE_CB], si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], si->sbi->gc_reclaimed_segs[GC_IDLE_AT], si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], + si->sbi->gc_reclaimed_segs[GC_URGENT_MID], si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 68f9fa5571aa..c7aec44300dd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1311,6 +1311,7 @@ enum { GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, + GC_URGENT_MID, MAX_GC_MODE, }; @@ -2794,6 +2795,9 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) if (is_inflight_io(sbi, type)) return false; + if (sbi->gc_mode == GC_URGENT_MID) + return true; + if (sbi->gc_mode == GC_URGENT_LOW && (type == DISCARD_TIME || type == GC_TIME)) return true; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8ab4b40c12c5..79dc38eacb19 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -103,7 +103,10 @@ static int gc_thread_func(void *data) sbi->gc_urgent_high_remaining--; } spin_unlock(&sbi->gc_urgent_high_lock); + } + if (sbi->gc_mode == GC_URGENT_HIGH || + sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; f2fs_down_write(&sbi->gc_lock); goto do_gc; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index dced8a2dc034..f1823031a0eb 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -472,6 +472,13 @@ static ssize_t __sbi_store(struct f2fs_attr *a, } } else if (t == 2) { sbi->gc_mode = GC_URGENT_LOW; + } else if (t == 3) { + sbi->gc_mode = GC_URGENT_MID; + if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + } } else { return -EINVAL; } From 171a26f370362c6af78c4a5a153d2a88991df74d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Mar 2022 16:33:15 +0800 
Subject: [PATCH 387/580] f2fs: initialize sbi->gc_mode explicitly It needs to initialized sbi->gc_mode to GC_NORMAL explicitly. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6537ef36498e..8c65eb79a541 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3556,6 +3556,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->gc_mode = GC_NORMAL; sbi->next_victim_seg[BG_GC] = NULL_SEGNO; sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; From 5a4ae443ba062b4897f2ef12f91c9da99b11d9a1 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 18 Mar 2022 09:23:04 +0800 Subject: [PATCH 388/580] f2fs: fix compressed file start atomic write may cause data corruption When compressed file has blocks, f2fs_ioc_start_atomic_write will succeed, but compressed flag will be remained in inode. If write partial compreseed cluster and commit atomic write will cause data corruption. This is the reproduction process: Step 1: create a compressed file ,write 64K data , call fsync(), then the blocks are write as compressed cluster. Step2: iotcl(F2FS_IOC_START_ATOMIC_WRITE) --- this should be fail, but not. write page 0 and page 3. iotcl(F2FS_IOC_COMMIT_ATOMIC_WRITE) -- page 0 and 3 write as normal file, Step3: drop cache. read page 0-4 -- Since page 0 has a valid block address, read as non-compressed cluster, page 1 and 2 will be filled with compressed data or zero. The root cause is, after commit 7eab7a696827 ("f2fs: compress: remove unneeded read when rewrite whole cluster"), in step 2, f2fs_write_begin() only set target page dirty, and in f2fs_commit_inmem_pages(), we will write partial raw pages into compressed cluster, result in corrupting compressed cluster layout. 
Fixes: 4c8ff7095bef ("f2fs: support data compression") Fixes: 7eab7a696827 ("f2fs: compress: remove unneeded read when rewrite whole cluster") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/file.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c8dfee84f598..ab4c3be3ac7f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3418,7 +3418,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *fsdata = NULL; - if (len == PAGE_SIZE) + if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) goto repeat; ret = f2fs_prepare_compress_overwrite(inode, pagep, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4145466cf3d9..5c837ceb86bf 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2083,7 +2083,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - f2fs_disable_compressed_file(inode); + if (!f2fs_disable_compressed_file(inode)) { + ret = -EINVAL; + goto out; + } if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) From 5ff2f629465b82d7fb1ba3123a1a6ce13a260644 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Mar 2022 17:02:30 +0800 Subject: [PATCH 389/580] f2fs: use aggressive GC policy during f2fs_disable_checkpoint() Let's enable GC_URGENT_HIGH mode during f2fs_disable_checkpoint(), so that we can use SSR allocator for GCed data/node persistence, it can improve the performance due to it avoiding migration of data/node locates in selected target segment of SSR allocator. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8c65eb79a541..6306021059e0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2075,6 +2075,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; + unsigned int gc_mode; int err = 0; int ret; block_t unusable; @@ -2087,6 +2088,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) f2fs_update_time(sbi, DISABLE_TIME); + gc_mode = sbi->gc_mode; + sbi->gc_mode = GC_URGENT_HIGH; + while (!f2fs_time_over(sbi, DISABLE_TIME)) { f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); @@ -2124,6 +2128,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) out_unlock: f2fs_up_write(&sbi->gc_lock); restore_flag: + sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } From 6df080eaa5cbaa8eadc2be165da829e567f4b3e6 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 18 Mar 2022 12:13:23 -0700 Subject: [PATCH 390/580] f2fs: make gc_urgent and gc_segment_mode sysfs node readable Changed a way of showing values of them to use strings. 
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f1823031a0eb..f54d0d049d0e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -41,6 +41,16 @@ enum { ATGC_INFO, /* struct atgc_management */ }; +static const char *gc_mode_names[MAX_GC_MODE] = { + "GC_NORMAL", + "GC_IDLE_CB", + "GC_IDLE_GREEDY", + "GC_IDLE_AT", + "GC_URGENT_HIGH", + "GC_URGENT_LOW", + "GC_URGENT_MID" +}; + struct f2fs_attr { struct attribute attr; ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); @@ -320,8 +330,13 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, sbi->compr_new_inode); #endif + if (!strcmp(a->attr.name, "gc_urgent")) + return snprintf(buf, PAGE_SIZE, "%s\n", + gc_mode_names[sbi->gc_mode]); + if (!strcmp(a->attr.name, "gc_segment_mode")) - return snprintf(buf, PAGE_SIZE, "%u\n", sbi->gc_segment_mode); + return snprintf(buf, PAGE_SIZE, "%s\n", + gc_mode_names[sbi->gc_segment_mode]); if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { return snprintf(buf, PAGE_SIZE, "%u\n", From df24d56fe2bb58acf0dc20cdb38ffe7b18e7f002 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 21 Mar 2022 23:22:11 +0800 Subject: [PATCH 391/580] f2fs: fix to do sanity check on .cp_pack_total_block_count As bughunter reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215709 f2fs may hang when mounting a fuzzed image, the dmesg shows as below: __filemap_get_folio+0x3a9/0x590 pagecache_get_page+0x18/0x60 __get_meta_page+0x95/0x460 [f2fs] get_checkpoint_version+0x2a/0x1e0 [f2fs] validate_checkpoint+0x8e/0x2a0 [f2fs] f2fs_get_valid_checkpoint+0xd0/0x620 [f2fs] f2fs_fill_super+0xc01/0x1d40 [f2fs] mount_bdev+0x18a/0x1c0 f2fs_mount+0x15/0x20 [f2fs] legacy_get_tree+0x28/0x50 vfs_get_tree+0x27/0xc0 path_mount+0x480/0xaa0 do_mount+0x7c/0xa0 __x64_sys_mount+0x8b/0xe0 do_syscall_64+0x38/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is cp_pack_total_block_count field in checkpoint was fuzzed to one, as calcuated, two cp pack block locates in the same block address, so then read latter cp pack block, it will block on the page lock due to the lock has already held when reading previous cp pack block, fix it by adding sanity check for cp_pack_total_block_count. 
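Editor's note: with illustrative numbers, the hang follows directly from the pack-2 address arithmetic fixed in the hunk below:

	block_t cp_addr = 512;		/* start of cp pack #1 (example value) */
	unsigned int cp_blocks = 1;	/* fuzzed cp_pack_total_block_count */

	/*
	 * Page of cp pack #2 = cp_addr + cp_blocks - 1 == 512 again, so the
	 * second get_checkpoint_version() blocks on the page lock that is
	 * still held from reading pack #1 -- hence the added lower bound
	 * against F2FS_CP_PACKS alongside the existing blocks_per_seg check.
	 */
	block_t cp_addr2 = cp_addr + cp_blocks - 1;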
Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index dea5b5c57bd8..bd2a0f5854aa 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -875,6 +875,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, struct page *cp_page_1 = NULL, *cp_page_2 = NULL; struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; + unsigned int cp_blocks; int err; err = get_checkpoint_version(sbi, cp_addr, &cp_block, @@ -882,15 +883,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (err) return NULL; - if (le32_to_cpu(cp_block->cp_pack_total_block_count) > - sbi->blocks_per_seg) { + cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count); + + if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) { f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u", le32_to_cpu(cp_block->cp_pack_total_block_count)); goto invalid_cp; } pre_version = *version; - cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; + cp_addr += cp_blocks - 1; err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_2, version); if (err) From 09ececf336ff05339aff9b8a0d9baec6d43bffaf Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 22 Mar 2022 14:39:13 -0700 Subject: [PATCH 392/580] f2fs: replace congestion_wait() calls with io_schedule_timeout() As congestion is no longer tracked, congestion_wait() is effectively equivalent to io_schedule_timeout(). So introduce f2fs_io_schedule_timeout() which sets TASK_UNINTERRUPTIBLE and call that instead. Link: https://lkml.kernel.org/r/164549983744.9187.6425865370954230902.stgit@noble.brown Signed-off-by: NeilBrown Cc: Anna Schumaker Cc: Chao Yu Cc: Darrick J. 
Wong Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jeff Layton Cc: Jens Axboe Cc: Lars Ellenberg Cc: Miklos Szeredi Cc: Paolo Valente Cc: Philipp Reisner Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/compress.c | 4 +--- fs/f2fs/data.c | 3 +-- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/segment.c | 8 +++----- fs/f2fs/super.c | 6 ++---- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4d8d51502ebb..e0409daf1ff0 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1474,9 +1474,7 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, if (IS_NOQUOTA(cc->inode)) return 0; ret = 0; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } return ret; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ab4c3be3ac7f..3edc48bc4aae 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3108,8 +3108,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, + f2fs_io_schedule_timeout( DEFAULT_IO_TIMEOUT); goto retry_write; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7aec44300dd..303464bc293c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4587,6 +4587,12 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; } +static inline void f2fs_io_schedule_timeout(long timeout) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(timeout); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 95041d42d30b..04442f07388c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -314,8 +314,7 @@ void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) skip: iput(inode); } - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - cond_resched(); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); if (gc_failure) { if (++looped >= count) return; @@ -806,8 +805,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (ret && --count); if (ret) { @@ -3139,7 +3137,7 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6306021059e0..71dfe69007eb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2140,8 +2140,7 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* we should flush all the data to keep data consistency */ do { sync_inodes_sb(sbi->sb); - cond_resched(); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); if (unlikely(retry < 0)) @@ -2510,8 +2509,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + 
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); From d16185b45e8cb88bd37555d787dff175892340af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 14:41:22 +0200 Subject: [PATCH 393/580] f2fs: don't pass a bio to f2fs_target_device Set the bdev at bio allocation time by changing the f2fs_target_device calling conventions, so that no bio needs to be passed in. Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20220228124123.856027-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/f2fs/data.c | 21 +++++++++++++-------- fs/f2fs/f2fs.h | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3edc48bc4aae..f15e8080999c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -352,7 +352,7 @@ static void f2fs_write_end_io(struct bio *bio) } struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio) + block_t blk_addr, sector_t *sector) { struct block_device *bdev = sbi->sb->s_bdev; int i; @@ -367,10 +367,9 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, } } } - if (bio) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); - } + + if (sector) + *sector = SECTOR_FROM_BLOCK(blk_addr); return bdev; } @@ -400,11 +399,14 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; + struct block_device *bdev; + sector_t sector; struct bio *bio; + bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); - - f2fs_target_device(sbi, fio->new_blkaddr, bio); + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; bio->bi_private = NULL; @@ -948,13 +950,16 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio *bio; struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; + sector_t sector; + struct block_device *bdev = f2fs_target_device(sbi, blkaddr, §or); bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES), &f2fs_bioset); + bio_set_dev(bio, bdev); if (!bio) return ERR_PTR(-ENOMEM); - f2fs_target_device(sbi, blkaddr, bio); + bio->bi_iter.bi_sector = sector; bio->bi_end_io = f2fs_read_end_io; bio_set_op_attrs(bio, REQ_OP_READ, op_flag); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 303464bc293c..79eb2ee8f190 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3778,7 +3778,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio); + block_t blk_addr, sector_t *sector); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); From 9f4c08fb4f9b26f35f1f639b11378e3c0580f17a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 14:41:23 +0200 Subject: [PATCH 394/580] f2fs: pass the bio operation to bio_alloc_bioset Refactor block I/O code so that the bio operation and known flags are set at bio allocation time. Only the later updated flags are updated on the fly. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20220228124123.856027-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/f2fs/data.c | 67 ++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f15e8080999c..1093dc24475c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -396,6 +396,24 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; } +static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) +{ + unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; + unsigned int fua_flag = io_flag & temp_mask; + unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + + /* + * data/node io flag bits per temp: + * REQ_META | REQ_FUA | + * 5 | 4 | 3 | 2 | 1 | 0 | + * Cold | Warm | Hot | Cold | Warm | Hot | + */ + if ((1 << fio->temp) & meta_flag) + fio->op_flags |= REQ_META; + if ((1 << fio->temp) & fua_flag) + fio->op_flags |= REQ_FUA; +} + static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; @@ -403,9 +421,15 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) sector_t sector; struct bio *bio; + if (fio->type == DATA) + __attach_io_flag(fio, sbi->data_io_flag); + else if (fio->type == NODE) + __attach_io_flag(fio, sbi->node_io_flag); + bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, fio->op, fio->op_flags); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; @@ -483,34 +507,6 @@ void f2fs_submit_bio(struct f2fs_sb_info *sbi, __submit_bio(sbi, bio, type); } -static void __attach_io_flag(struct f2fs_io_info *fio) -{ - struct f2fs_sb_info *sbi = fio->sbi; - unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int io_flag, fua_flag, meta_flag; - - if (fio->type == DATA) - io_flag = sbi->data_io_flag; - else if (fio->type == NODE) - io_flag = sbi->node_io_flag; - else - return; - - fua_flag = io_flag & temp_mask; - meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; - - /* - * data/node io flag bits per temp: - * REQ_META | REQ_FUA | - * 5 | 4 | 3 | 2 | 1 | 0 | - * Cold | Warm | Hot | Cold | Warm | Hot | - */ - if ((1 << fio->temp) & meta_flag) - fio->op_flags |= REQ_META; - if ((1 << fio->temp) & fua_flag) - fio->op_flags |= REQ_FUA; -} - static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -518,9 +514,6 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - __attach_io_flag(fio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); - if (is_read_io(fio->op)) trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); else @@ -578,10 +571,9 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; + io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); f2fs_up_write(&io->io_rwsem); @@ -659,9 +651,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_io(fio->io_wbc, page, 
PAGE_SIZE); - __attach_io_flag(fio); - bio_set_op_attrs(bio, fio->op, fio->op_flags); - inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page): WB_DATA_TYPE(fio->page)); @@ -848,8 +837,6 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_PAGES); - __attach_io_flag(fio); - bio_set_op_attrs(bio, fio->op, fio->op_flags); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { @@ -957,11 +944,11 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, min_t(int, nr_pages, BIO_MAX_PAGES), &f2fs_bioset); bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (!bio) return ERR_PTR(-ENOMEM); bio->bi_iter.bi_sector = sector; bio->bi_end_io = f2fs_read_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (f2fs_encrypted_file(inode)) post_read_steps |= STEP_DECRYPT; From b3a8832c8e5fc73323d9b2fed64b1ba6085d1960 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Apr 2022 14:45:50 -0700 Subject: [PATCH 395/580] f2fs: remove obsolete whint_mode This patch removes obsolete whint_mode. Fixes: 41d36a9f3e53 ("fs: remove kiocb.ki_hint") Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 70 ---------------------- fs/f2fs/data.c | 11 ---- fs/f2fs/f2fs.h | 9 --- fs/f2fs/segment.c | 95 ------------------------------ fs/f2fs/super.c | 32 +--------- 5 files changed, 1 insertion(+), 216 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 5fda320354a6..d294d04c06d0 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -235,12 +235,6 @@ offgrpjquota Turn off group journalled quota. offprjjquota Turn off project journalled quota. quota Enable plain user disk quota accounting. noquota Disable all plain disk quota option. -whint_mode=%s Control which write hints are passed down to block - layer. This supports "off", "user-based", and - "fs-based". In "off" mode (default), f2fs does not pass - down hints. In "user-based" mode, f2fs tries to pass - down hints given by users. And in "fs-based" mode, f2fs - passes down hints with its policy. alloc_mode=%s Adjust block allocation policy, which supports "reuse" and "default". fsync_mode=%s Control the policy of fsync. Currently supports "posix", @@ -751,70 +745,6 @@ In order to identify whether the data in the victim segment are valid or not, F2FS manages a bitmap. Each bit represents the validity of a block, and the bitmap is composed of a bit stream covering whole blocks in main area. -Write-hint Policy ------------------ - -1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - -2) whint_mode=user-based. F2FS tries to pass down hints given by -users. 
- -===================== ======================== =================== -User F2FS Block -===================== ======================== =================== -N/A META WRITE_LIFE_NOT_SET -N/A HOT_NODE " -N/A WARM_NODE " -N/A COLD_NODE " -ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME -extension list " " - --- buffered io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " " -WRITE_LIFE_MEDIUM " " -WRITE_LIFE_LONG " " - --- direct io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " WRITE_LIFE_NONE -WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM -WRITE_LIFE_LONG " WRITE_LIFE_LONG -===================== ======================== =================== - -3) whint_mode=fs-based. F2FS passes down hints with its policy. - -===================== ======================== =================== -User F2FS Block -===================== ======================== =================== -N/A META WRITE_LIFE_MEDIUM; -N/A HOT_NODE WRITE_LIFE_NOT_SET -N/A WARM_NODE " -N/A COLD_NODE WRITE_LIFE_NONE -ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME -extension list " " - --- buffered io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG -WRITE_LIFE_NONE " " -WRITE_LIFE_MEDIUM " " -WRITE_LIFE_LONG " " - --- direct io -WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME -WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT -WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET -WRITE_LIFE_NONE " WRITE_LIFE_NONE -WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM -WRITE_LIFE_LONG " WRITE_LIFE_LONG -===================== ======================== =================== - Fallocate(2) Policy ------------------- diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1093dc24475c..5d3953e59ef8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -437,8 +437,6 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, - fio->type, fio->temp); } iostat_alloc_and_bind_ctx(sbi, bio, NULL); @@ -3625,8 +3623,6 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; int rw = iov_iter_rw(iter); int err; - enum rw_hint hint = iocb->ki_hint; - int whint_mode = F2FS_OPTION(sbi).whint_mode; bool do_opu; err = check_direct_IO(inode, iter, offset); @@ -3640,18 +3636,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) trace_f2fs_direct_IO_enter(inode, offset, count, rw); - if (rw == WRITE && whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; - if (iocb->ki_flags & IOCB_NOWAIT) { if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[rw])) { - iocb->ki_hint = hint; err = -EAGAIN; goto out; } if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { f2fs_up_read(&fi->i_gc_rwsem[rw]); - iocb->ki_hint = hint; err = -EAGAIN; goto out; } @@ -3673,8 +3664,6 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) f2fs_up_read(&fi->i_gc_rwsem[rw]); if (rw == WRITE) { - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; if (err > 0) { f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, err); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 79eb2ee8f190..87b7453f043f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -151,7 +151,6 @@ struct f2fs_mount_info { int 
s_jquota_fmt; /* Format of quota to use */ #endif /* For which write hints are passed down to block layer */ - int whint_mode; int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ @@ -1331,12 +1330,6 @@ enum { FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ }; -enum { - WHINT_MODE_OFF, /* not pass down write hints */ - WHINT_MODE_USER, /* try to pass down hints given by users */ - WHINT_MODE_FS, /* pass down hints with F2FS policy */ -}; - enum { ALLOC_MODE_DEFAULT, /* stay default */ ALLOC_MODE_REUSE, /* reuse segments as much as possible */ @@ -3696,8 +3689,6 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp); unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int segno); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 04442f07388c..b803c9a77cf6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3245,101 +3245,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint) } } -/* This returns write hints for each segment type. This hints will be - * passed down to block layer. There are mapping tables which depend on - * the mount option 'whint_mode'. - * - * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - * - * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. - * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_NOT_SET - * HOT_NODE " - * WARM_NODE " - * COLD_NODE " - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - * - * 3) whint_mode=fs-based. F2FS passes down hints with its policy. 
- * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_MEDIUM; - * HOT_NODE WRITE_LIFE_NOT_SET - * WARM_NODE " - * COLD_NODE WRITE_LIFE_NONE - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - */ - -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp) -{ - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_NOT_SET; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else { - return WRITE_LIFE_NOT_SET; - } - } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_LONG; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else if (type == NODE) { - if (temp == WARM || temp == HOT) - return WRITE_LIFE_NOT_SET; - else if (temp == COLD) - return WRITE_LIFE_NONE; - } else if (type == META) { - return WRITE_LIFE_MEDIUM; - } - } - return WRITE_LIFE_NOT_SET; -} - static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 71dfe69007eb..ae17ed862c01 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -137,7 +137,6 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, - Opt_whint, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, @@ -212,7 +211,6 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, @@ -979,22 +977,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "quota operations not supported"); break; #endif - case Opt_whint: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "user-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; - } else if (!strcmp(name, "off")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - } else if (!strcmp(name, "fs-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; case Opt_alloc: name = match_strdup(&args[0]); if (!name) @@ -1317,12 +1299,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } - /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_PERSIST_TYPE. 
- */ - if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE) - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1974,10 +1950,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) - seq_printf(seq, ",whint_mode=%s", "user-based"); - else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) - seq_printf(seq, ",whint_mode=%s", "fs-based"); fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); @@ -2026,7 +1998,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); @@ -2305,8 +2276,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY || - F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { + if (*flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); From 95587e087576bc784c2738cf11f5436c72c31226 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Apr 2022 15:01:58 -0700 Subject: [PATCH 396/580] f2fs: keep io_flags to avoid IO split due to different op_flags in two fio holders Let's attach io_flags to bio only, so that we can merge IOs given original io_flags only. Fixes: 64bf0eef0171 ("f2fs: pass the bio operation to bio_alloc_bioset") Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5d3953e59ef8..d18299185fee 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -396,11 +396,23 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; } -static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) +static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int fua_flag = io_flag & temp_mask; - unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + unsigned int fua_flag, meta_flag, io_flag; + unsigned int op_flags = 0; + + if (fio->op != REQ_OP_WRITE) + return 0; + if (fio->type == DATA) + io_flag = fio->sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = fio->sbi->node_io_flag; + else + return 0; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; /* * data/node io flag bits per temp: @@ -409,9 +421,10 @@ static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) * Cold | Warm | Hot | Cold | Warm | Hot | */ if ((1 << fio->temp) & meta_flag) - fio->op_flags |= REQ_META; + op_flags |= REQ_META; if ((1 << fio->temp) & fua_flag) - fio->op_flags |= REQ_FUA; + op_flags |= REQ_FUA; + return op_flags; } static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) @@ -421,15 +434,10 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) sector_t sector; struct bio *bio; - if (fio->type == DATA) - __attach_io_flag(fio, sbi->data_io_flag); - else if (fio->type == NODE) - __attach_io_flag(fio, sbi->node_io_flag); - bdev 
= f2fs_target_device(sbi, fio->new_blkaddr, §or); bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, fio->op, fio->op_flags); + bio_set_op_attrs(bio, fio->op, fio->op_flags | f2fs_io_flags(fio)); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; From 59cfa13f04698e71e9b9e398ecb38100e05b275b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 29 Mar 2022 11:28:07 -0700 Subject: [PATCH 397/580] f2fs: fix wrong condition check when failing metapage read This patch fixes wrong initialization. Fixes: 50c63009f6ab ("f2fs: avoid an infinite loop in f2fs_sync_dirty_inodes") Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index bd2a0f5854aa..13ee979b65cf 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,9 +98,9 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs && - sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) { - set_ckpt_flags(sbi, CP_ERROR_FLAG); + if (page->index == sbi->metapage_eio_ofs) { + if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); } else { sbi->metapage_eio_ofs = page->index; sbi->metapage_eio_cnt = 0; From 846d4c2b5c78126b09d6dec8cfd84db1e897b0e3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 21 Apr 2022 16:47:02 -0700 Subject: [PATCH 398/580] f2fs: should not truncate blocks during roll-forward recovery If the file preallocated blocks and fsync'ed, we should not truncate them during roll-forward recovery which will recover i_size correctly back. Fixes: d4dd19ec1ea0 ("f2fs: do not expose unwritten blocks to user by DIO") Cc: # 5.17+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7b7758736859..c76b2153455f 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -550,7 +550,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) } f2fs_set_inode_flags(inode); - if (file_should_truncate(inode)) { + if (file_should_truncate(inode) && + !is_sbi_flag_set(sbi, SBI_POR_DOING)) { ret = f2fs_truncate(inode); if (ret) goto bad_inode; From 656b39cf91dff3ea6567c65aad440252282fad1e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 20 Mar 2022 23:11:17 +0800 Subject: [PATCH 399/580] f2fs: check pinfile in gc_data_segment() in advance In order to skip migrating section which contains data of pinned file in advance. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 79dc38eacb19..e2c9555f8bd6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1481,6 +1481,13 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, special_file(inode->i_mode)) continue; + if (is_inode_flag_set(inode, FI_PIN_FILE) && + gc_type == FG_GC) { + f2fs_pin_file_control(inode, true); + iput(inode); + return submitted; + } + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); From a7b07907a85ec06422837dbec3e9371499212e0b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 20 Mar 2022 23:11:18 +0800 Subject: [PATCH 400/580] f2fs: don't set GC_FAILURE_PIN for background GC So that it can reduce the possibility that file be unpinned forcely by foreground GC due to .i_gc_failures[GC_FAILURE_PIN] exceeds threshold. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e2c9555f8bd6..79ea591f176c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1202,7 +1202,8 @@ static int move_data_block(struct inode *inode, block_t bidx, } if (f2fs_is_pinned_file(inode)) { - f2fs_pin_file_control(inode, true); + if (gc_type == FG_GC) + f2fs_pin_file_control(inode, true); err = -EAGAIN; goto out; } From 27a25a92ec185897f19807845886c9567020e3aa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Mar 2022 13:30:04 -0700 Subject: [PATCH 401/580] f2fs: remove unnecessary f2fs_lock_op in f2fs_new_inode This can be removed, since f2fs_alloc_nid() actually doesn't require to block checkpoint and __f2fs_build_free_nids() is covered by nm_i->nat_tree_lock. Suggested-by: Linus Torvalds Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a0604e949adc..90c632ba4983 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -35,13 +35,10 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (!inode) return ERR_PTR(-ENOMEM); - f2fs_lock_op(sbi); if (!f2fs_alloc_nid(sbi, &ino)) { - f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - f2fs_unlock_op(sbi); nid_free = true; From 9547bec2fdb380193ef0978a986a4fbf56fb85bf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 21 Mar 2022 15:13:06 -0700 Subject: [PATCH 402/580] f2fs: introduce data read/write showing path info This was used in Android for a long time. Let's upstream it. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 47 ++++++++++++++++++- include/trace/events/f2fs.h | 94 +++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c837ceb86bf..b910920e12a5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4384,11 +4384,32 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; + if (trace_f2fs_dataread_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_read_trace; + + path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_read_trace; + } + + trace_f2fs_dataread_start(inode, iocb->ki_pos, + iov_iter_count(iter), + current->pid, path, current->comm); + kfree(p); + } +skip_read_trace: ret = generic_file_read_iter(iocb, iter); if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_READ_IO, ret); + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, iocb->ki_pos, ret); return ret; } @@ -4525,11 +4546,33 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); preallocated = f2fs_preallocate_blocks(iocb, from); - if (preallocated < 0) + if (preallocated < 0) { ret = preallocated; - else + } else { + if (trace_f2fs_datawrite_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), + PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_write_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), + p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_write_trace; + } + trace_f2fs_datawrite_start(inode, orig_pos, orig_count, + current->pid, path, current->comm); + kfree(p); + } +skip_write_trace: ret = __generic_file_write_iter(iocb, from); + if (trace_f2fs_datawrite_end_enabled()) + trace_f2fs_datawrite_end(inode, orig_pos, ret); + } + /* Don't leave any preallocated blocks around past i_size. */ if (preallocated && i_size_read(inode) < target_size) { f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 2f0568950009..1bdf7c993f7b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -2074,6 +2074,100 @@ TRACE_EVENT(f2fs_fiemap, __entry->ret) ); +DECLARE_EVENT_CLASS(f2fs__rw_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command), + + TP_STRUCT__entry( + __string(pathbuf, pathname) + __field(loff_t, offset) + __field(int, bytes) + __field(loff_t, i_size) + __string(cmdline, command) + __field(pid_t, pid) + __field(ino_t, ino) + ), + + TP_fast_assign( + /* + * Replace the spaces in filenames and cmdlines + * because this screws up the tooling that parses + * the traces. 
+ */ + __assign_str(pathbuf, pathname); + (void)strreplace(__get_str(pathbuf), ' ', '_'); + __entry->offset = offset; + __entry->bytes = bytes; + __entry->i_size = i_size_read(inode); + __assign_str(cmdline, command); + (void)strreplace(__get_str(cmdline), ' ', '_'); + __entry->pid = pid; + __entry->ino = inode->i_ino; + ), + + TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," + " pid %d, i_size %llu, ino %lu", + __get_str(pathbuf), __entry->offset, __entry->bytes, + __get_str(cmdline), __entry->pid, __entry->i_size, + (unsigned long) __entry->ino) +); + +DECLARE_EVENT_CLASS(f2fs__rw_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes), + + TP_STRUCT__entry( + __field(ino_t, ino) + __field(loff_t, offset) + __field(int, bytes) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->bytes = bytes; + ), + + TP_printk("ino %lu, offset %llu, bytes %d", + (unsigned long) __entry->ino, + __entry->offset, __entry->bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_dataread_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_dataread_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_datawrite_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From c206b751f94f22669664e58782fbe16e00fe6557 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 29 Mar 2022 00:02:53 +0800 Subject: [PATCH 403/580] f2fs: fix to do sanity check on inline_dots inode As Wenqing reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215765 It will cause a kernel panic with steps: - mkdir mnt - mount tmp40.img mnt - ls mnt folio_mark_dirty+0x33/0x50 f2fs_add_regular_entry+0x541/0xad0 [f2fs] f2fs_add_dentry+0x6c/0xb0 [f2fs] f2fs_do_add_link+0x182/0x230 [f2fs] __recover_dot_dentries+0x2d6/0x470 [f2fs] f2fs_lookup+0x5af/0x6a0 [f2fs] __lookup_slow+0xac/0x200 lookup_slow+0x45/0x70 walk_component+0x16c/0x250 path_lookupat+0x8b/0x1f0 filename_lookup+0xef/0x250 user_path_at_empty+0x46/0x70 vfs_statx+0x98/0x190 __do_sys_newlstat+0x41/0x90 __x64_sys_newlstat+0x1a/0x30 do_syscall_64+0x37/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is for special file: e.g. character, block, fifo or socket file, f2fs doesn't assign address space operations pointer array for mapping->a_ops field, so, in a fuzzed image, if inline_dots flag was tagged in special file, during lookup(), when f2fs runs into __recover_dot_dentries(), it will cause NULL pointer access once f2fs_add_regular_entry() calls a_ops->set_dirty_page(). 
Fixes: 510022a85839 ("f2fs: add F2FS_INLINE_DOTS to recover missing dot dentries") Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 90c632ba4983..edfda2740a93 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -453,6 +453,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + if (!S_ISDIR(dir->i_mode)) { + f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)", + dir->i_ino, dir->i_mode, pino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -ENOTDIR; + } + err = f2fs_dquot_initialize(dir); if (err) return err; From f44fafa541dcb62bf975f6c6a896437534797693 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Fri, 1 Apr 2022 00:34:14 +0200 Subject: [PATCH 404/580] f2fs: fix dereference of stale list iterator after loop body The list iterator variable will be a bogus pointer if no break was hit. Dereferencing it (cur->page in this case) could load an out-of-bounds/undefined value making it unsafe to use that in the comparision to determine if the specific element was found. Since 'cur->page' *can* be out-ouf-bounds it cannot be guaranteed that by chance (or intention of an attacker) it matches the value of 'page' even though the correct element was not found. This is fixed by using a separate list iterator variable for the loop and only setting the original variable if a suitable element was found. Then determing if the element was found is simply checking if the variable is set. Fixes: 8c242db9b8c0 ("f2fs: fix stale ATOMIC_WRITTEN_PAGE private pointer") Signed-off-by: Jakob Koschel Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b803c9a77cf6..d21776b12d26 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -356,16 +356,19 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct list_head *head = &fi->inmem_pages; struct inmem_pages *cur = NULL; + struct inmem_pages *tmp; f2fs_bug_on(sbi, !page_private_atomic(page)); mutex_lock(&fi->inmem_lock); - list_for_each_entry(cur, head, list) { - if (cur->page == page) + list_for_each_entry(tmp, head, list) { + if (tmp->page == page) { + cur = tmp; break; + } } - f2fs_bug_on(sbi, list_empty(head) || cur->page != page); + f2fs_bug_on(sbi, !cur); list_del(&cur->list); mutex_unlock(&fi->inmem_lock); From c5eeeb9543c329c33daccf1ed21afb8830b078af Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Tue, 12 Apr 2022 14:20:39 +0200 Subject: [PATCH 405/580] f2fs: Remove usage of list iterator pas the loop for list_move_tail() In preparation to limit the scope of a list iterator to the list traversal loop, the usage of the list iterator variable 'next' should be avoided past the loop body [1]. Instead of calling list_move_tail() on 'next' after the loop, it is called within the loop if the correct location was found. After the loop it covers the case if no location was found and it should be inserted based on the 'head' of the list. 
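The pattern shared by this fix and the neighbouring list-iterator patches can be shown with a minimal, standalone sketch (illustrative only; the struct, field and function names below are invented for the example and are not f2fs code): the list_for_each_entry() cursor is only meaningful inside the loop body, so a match is recorded in a separate variable and only that variable is used after the loop.

#include <linux/list.h>

struct item {
	struct list_head list;
	int key;
};

/* Return the matching entry, or NULL; never reuse the loop cursor. */
static struct item *find_item(struct list_head *head, int key)
{
	struct item *found = NULL, *iter;

	list_for_each_entry(iter, head, list) {
		if (iter->key == key) {
			found = iter;	/* remember the match inside the loop */
			break;
		}
	}

	/* 'iter' is a bogus, head-embedded pointer here if no break was hit */
	return found;
}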
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d21776b12d26..abc127f80be4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4091,10 +4091,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses, return; list_for_each_entry_continue(next, head, set_list) - if (ses->entry_cnt <= next->entry_cnt) - break; + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } - list_move_tail(&ses->set_list, &next->set_list); + list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) From d8b758c9ffa7b274046c9cce081cd9c4a491ca4b Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Tue, 12 Apr 2022 14:20:40 +0200 Subject: [PATCH 406/580] f2fs: replace usage of found with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean [1]. This removes the need to use a found variable and simply checking if the variable was set, can determine if the break/goto was hit. Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index abc127f80be4..4bfb6ff44570 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1672,33 +1672,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - bool need_wait; + struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: - need_wait = false; + dc = NULL; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart + dc->len <= start || end <= dc->lstart) + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->lstart + iter->len <= start || end <= iter->lstart) continue; - if (dc->len < dpolicy->granularity) + if (iter->len < dpolicy->granularity) continue; - if (dc->state == D_DONE && !dc->ref) { - wait_for_completion_io(&dc->wait); - if (!dc->error) - trimmed += dc->len; - __remove_discard_cmd(sbi, dc); + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->len; + __remove_discard_cmd(sbi, iter); } else { - dc->ref++; - need_wait = true; + iter->ref++; + dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { + if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } From f8356b88ccbfc6278509dc85530e1b4f80260752 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 15 Apr 2022 21:19:02 +0800 Subject: [PATCH 407/580] f2fs: remove WARN_ON in f2fs_is_valid_blkaddr Syzbot triggers two WARNs in f2fs_is_valid_blkaddr and __is_bitmap_valid. 
For example, in f2fs_is_valid_blkaddr, if type is DATA_GENERIC_ENHANCE or DATA_GENERIC_ENHANCE_READ, it invokes WARN_ON if blkaddr is not in the right range. The call trace is as follows: f2fs_get_node_info+0x45f/0x1070 read_node_page+0x577/0x1190 __get_node_page.part.0+0x9e/0x10e0 __get_node_page f2fs_get_node_page+0x109/0x180 do_read_inode f2fs_iget+0x2a5/0x58b0 f2fs_fill_super+0x3b39/0x7ca0 Fix these two WARNs by replacing WARN_ON with dump_stack. Reported-by: syzbot+763ae12a2ede1d99d4dc@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 13ee979b65cf..62d0ff42f600 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -158,7 +158,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); } return exist; } @@ -196,7 +196,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); return false; } else { return __is_bitmap_valid(sbi, blkaddr, type); From bc6d9fb4e514512be3842e89eb750c7de554afc7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 18 Apr 2022 16:57:44 -0700 Subject: [PATCH 408/580] f2fs: use flush command instead of FUA for zoned device The block layer for zoned disk can reorder the FUA'ed IOs. Let's use flush command to keep the write order. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- fs/f2fs/node.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b910920e12a5..c46058bc5303 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -372,7 +372,8 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) + if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) || + (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi))) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b6d250f954ca..91839a36c316 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1631,7 +1631,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) + if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ From e153c44a63c1ddab6316bcec24f269eb88ed3046 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 29 Mar 2022 16:25:54 -0700 Subject: [PATCH 409/580] f2fs: avoid infinite loop to flush node pages xfstests/generic/475 can give EIO all the time which give an infinite loop to flush node page like below. Let's avoid it. [16418.518551] Call Trace: [16418.518553] ? dm_submit_bio+0x48/0x400 [16418.518574] ? submit_bio_checks+0x1ac/0x5a0 [16418.525207] __submit_bio+0x1a9/0x230 [16418.525210] ? kmem_cache_alloc+0x29e/0x3c0 [16418.525223] submit_bio_noacct+0xa8/0x2b0 [16418.525226] submit_bio+0x4d/0x130 [16418.525238] __submit_bio+0x49/0x310 [f2fs] [16418.525339] ? 
bio_add_page+0x6a/0x90 [16418.525344] f2fs_submit_page_bio+0x134/0x1f0 [f2fs] [16418.525365] read_node_page+0x125/0x1b0 [f2fs] [16418.525388] __get_node_page.part.0+0x58/0x3f0 [f2fs] [16418.525409] __get_node_page+0x2f/0x60 [f2fs] [16418.525431] f2fs_get_dnode_of_data+0x423/0x860 [f2fs] [16418.525452] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [16418.525458] ? __mod_memcg_state.part.0+0x2a/0x30 [16418.525465] ? __mod_memcg_lruvec_state+0x27/0x40 [16418.525467] ? __xa_set_mark+0x57/0x70 [16418.525472] f2fs_do_write_data_page+0x10e/0x7b0 [f2fs] [16418.525493] f2fs_write_single_data_page+0x555/0x830 [f2fs] [16418.525514] ? sysvec_apic_timer_interrupt+0x4e/0x90 [16418.525518] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [16418.525523] f2fs_write_cache_pages+0x303/0x880 [f2fs] [16418.525545] ? blk_flush_plug_list+0x47/0x100 [16418.525548] f2fs_write_data_pages+0xfd/0x320 [f2fs] [16418.525569] do_writepages+0xd5/0x210 [16418.525648] filemap_fdatawrite_wbc+0x7d/0xc0 [16418.525655] filemap_fdatawrite+0x50/0x70 [16418.525658] f2fs_sync_dirty_inodes+0xa4/0x230 [f2fs] [16418.525679] f2fs_write_checkpoint+0x16d/0x1720 [f2fs] [16418.525699] ? ttwu_do_wakeup+0x1c/0x160 [16418.525709] ? ttwu_do_activate+0x6d/0xd0 [16418.525711] ? __wait_for_common+0x11d/0x150 [16418.525715] kill_f2fs_super+0xca/0x100 [f2fs] [16418.525733] deactivate_locked_super+0x3b/0xb0 [16418.525739] deactivate_super+0x40/0x50 [16418.525741] cleanup_mnt+0x139/0x190 [16418.525747] __cleanup_mnt+0x12/0x20 [16418.525749] task_work_run+0x6d/0xa0 [16418.525765] exit_to_user_mode_prepare+0x1ad/0x1b0 [16418.525771] syscall_exit_to_user_mode+0x27/0x50 [16418.525774] do_syscall_64+0x48/0xc0 [16418.525776] entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +------- fs/f2fs/f2fs.h | 23 +++++++++++++++++++---- fs/f2fs/node.c | 23 ++++++++++++----------- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 62d0ff42f600..9571945c056f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,13 +98,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs) { - if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) - set_ckpt_flags(sbi, CP_ERROR_FLAG); - } else { - sbi->metapage_eio_ofs = page->index; - sbi->metapage_eio_cnt = 0; - } + f2fs_handle_page_eio(sbi, page->index, META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 87b7453f043f..01484fef31ee 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -575,8 +575,8 @@ enum { /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 -/* maximum retry of EIO'ed meta page */ -#define MAX_RETRY_META_PAGE_EIO 100 +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ @@ -1618,8 +1618,8 @@ struct f2fs_sb_info { /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ - pgoff_t metapage_eio_ofs; /* EIO page offset */ - int metapage_eio_cnt; /* EIO count */ + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -4584,6 +4584,21 @@ static inline void f2fs_io_schedule_timeout(long timeout) io_schedule_timeout(timeout); } +static inline 
void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, + enum page_type type) +{ + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 91839a36c316..b8b055580420 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1416,8 +1416,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, err = read_node_page(page, 0); if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + goto out_put_err; } else if (err == LOCKED_PAGE) { err = 0; goto page_hit; @@ -1443,19 +1442,21 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, goto out_err; } page_hit: - if (unlikely(nid != nid_of_node(page))) { - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + if (likely(nid == nid_of_node(page))) + return page; + + f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - err = -EINVAL; + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; out_err: - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(err); - } - return page; + ClearPageUptodate(page); +out_put_err: + f2fs_handle_page_eio(sbi, page->index, NODE); + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) From e41720897c5b7bf576484c50f18fa8c4adc27a36 Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Fri, 22 Apr 2022 20:05:04 +0200 Subject: [PATCH 410/580] f2fs: extend stat_lock to avoid potential race in statfs There are multiple calculations and reads of fields of sbi that should be protected by stat_lock. As stat_lock is not used to read these values in statfs, this can lead to inconsistent results. Extend the locking to prevent this issue. Commit c9c8ed50d94c ("f2fs: fix to avoid potential race on sbi->unusable_block_count access/update") already added the use of sbi->stat_lock in statfs in order to make the calculation of multiple, different fields atomic so that results are consistent. This is similar to that patch regarding the change in statfs. 
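As a rough standalone illustration of the race class this closes (invented names; not the actual f2fs structures): two related counters read outside the writers' lock can come from different points in time, so any value derived from both may be computed from a mixed snapshot.

#include <linux/spinlock.h>
#include <linux/types.h>

struct counters {
	spinlock_t lock;	/* taken by every writer that updates the fields */
	u64 total;
	u64 used;
};

/* Racy: a writer can update both fields between the two reads. */
static u64 free_blocks_racy(struct counters *c)
{
	return c->total - c->used;
}

/* Consistent: both reads come from the same locked snapshot. */
static u64 free_blocks(struct counters *c)
{
	u64 total, used;

	spin_lock(&c->lock);
	total = c->total;
	used = c->used;
	spin_unlock(&c->lock);

	return total - used;
}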
Signed-off-by: Niels Dossche Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ae17ed862c01..6451d2ea89cd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1705,18 +1705,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count; u64 avail_node_count; + unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); - user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; + + spin_lock(&sbi->stat_lock); + + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - spin_lock(&sbi->stat_lock); if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) buf->f_bfree = 0; else @@ -1729,14 +1734,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) else buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { buf->f_files = user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; - buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_ffree = min(avail_node_count - total_valid_node_count, buf->f_bavail); } From b9dc8552d04628f6af6fb1bc4b763d39cbde9c46 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 30 Apr 2022 21:19:24 +0800 Subject: [PATCH 411/580] f2fs: fix to clear dirty inode in f2fs_evict_inode() As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215904 The kernel message is shown below: kernel BUG at fs/f2fs/inode.c:825! Call Trace: evict+0x282/0x4e0 __dentry_kill+0x2b2/0x4d0 shrink_dentry_list+0x17c/0x4f0 shrink_dcache_parent+0x143/0x1e0 do_one_tree+0x9/0x30 shrink_dcache_for_umount+0x51/0x120 generic_shutdown_super+0x5c/0x3a0 kill_block_super+0x90/0xd0 kill_f2fs_super+0x225/0x310 deactivate_locked_super+0x78/0xc0 cleanup_mnt+0x2b7/0x480 task_work_run+0xc8/0x150 exit_to_user_mode_prepare+0x14a/0x150 syscall_exit_to_user_mode+0x1d/0x40 do_syscall_64+0x48/0x90 The root cause is: inode node and dnode node share the same nid, so during f2fs_evict_inode(), dnode node truncation will invalidate its NAT entry, so when truncating inode node, it fails due to invalid NAT entry, result in inode is still marked as dirty, fix this issue by clearing dirty for inode and setting SBI_NEED_FSCK flag in filesystem. 
output from dump.f2fs: [print_node_info: 354] Node ID [0xf:15] is inode i_nid[0] [0x f : 15] Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c76b2153455f..7d8d319fc406 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -796,8 +796,22 @@ void f2fs_evict_inode(struct inode *inode) f2fs_lock_op(sbi); err = f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); - if (err == -ENOENT) + if (err == -ENOENT) { err = 0; + + /* + * in fuzzed image, another node may has the same + * block address as inode's, if it was truncated + * previously, truncation of inode node will fail. + */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_evict_inode: inconsistent node id, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } } /* give more chances, if ENOMEM case */ From 6b375e9852150e506e50294ce47cec4ebe3d660d Mon Sep 17 00:00:00 2001 From: Byungki Lee Date: Fri, 29 Apr 2022 13:29:53 -0700 Subject: [PATCH 412/580] f2fs: write checkpoint during FG_GC If there's not enough free sections each of which consistis of large segments, we can hit no free section for upcoming section allocation. Let's reclaim some prefree segments by writing checkpoints. Signed-off-by: Byungki Lee Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 79ea591f176c..97efa11beb27 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1791,23 +1791,31 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, if (sync) goto stop; - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round <= MAX_SKIP_GC_COUNT || - skipped_round * 2 < round) { - segno = NULL_SEGNO; - goto gc_more; - } + if (!has_not_enough_free_secs(sbi, sec_freed, 0)) + goto stop; - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + if (skipped_round <= MAX_SKIP_GC_COUNT || skipped_round * 2 < round) { + + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } + segno = NULL_SEGNO; + goto gc_more; } + if (first_skipped < last_skipped && + (last_skipped - first_skipped) > + sbi->skipped_gc_rwsem) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } + if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + ret = f2fs_write_checkpoint(sbi, &cpc); stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; From e18a1d0103f36ba670387664310784b1eb98f11d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Apr 2022 01:06:02 +0800 Subject: [PATCH 413/580] f2fs: fix to avoid f2fs_bug_on() in dec_valid_node_count() As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215897 I have encountered a bug in F2FS file system in kernel v5.17. The kernel should enable CONFIG_KASAN=y and CONFIG_KASAN_INLINE=y. 
You can reproduce the bug by running the following commands: The kernel message is shown below: kernel BUG at fs/f2fs/f2fs.h:2511! Call Trace: f2fs_remove_inode_page+0x2a2/0x830 f2fs_evict_inode+0x9b7/0x1510 evict+0x282/0x4e0 do_unlinkat+0x33a/0x540 __x64_sys_unlinkat+0x8e/0xd0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is: .total_valid_block_count or .total_valid_node_count could fuzzed to zero, then once dec_valid_node_count() was called, it will cause BUG_ON(), this patch fixes to print warning info and set SBI_NEED_FSCK into CP instead of panic. Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 01484fef31ee..42c2e54326b5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2615,11 +2615,17 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_block_count); - f2fs_bug_on(sbi, !sbi->total_valid_node_count); + if (unlikely(!sbi->total_valid_block_count || + !sbi->total_valid_node_count)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", + sbi->total_valid_block_count, + sbi->total_valid_node_count); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count--; + sbi->total_valid_node_count--; + } - sbi->total_valid_node_count--; - sbi->total_valid_block_count--; if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks++; From 8eb236ce1fc9afe3cf5efd04e40a12af9f188d9e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Apr 2022 17:51:40 +0800 Subject: [PATCH 414/580] f2fs: fix to do sanity check on block address in f2fs_do_zero_range() As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215894 I have encountered a bug in F2FS file system in kernel v5.17. I have uploaded the system call sequence as case.c, and a fuzzed image can be found in google net disk The kernel should enable CONFIG_KASAN=y and CONFIG_KASAN_INLINE=y. You can reproduce the bug by running the following commands: kernel BUG at fs/f2fs/segment.c:2291! Call Trace: f2fs_invalidate_blocks+0x193/0x2d0 f2fs_fallocate+0x2593/0x4a70 vfs_fallocate+0x2a5/0xac0 ksys_fallocate+0x35/0x70 __x64_sys_fallocate+0x8e/0xf0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is, after image was fuzzed, block mapping info in inode will be inconsistent with SIT table, so in f2fs_fallocate(), it will cause panic when updating SIT with invalid blkaddr. Let's fix the issue by adding sanity check on block address before updating SIT table with it. 
Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c46058bc5303..ac21254b2f0c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1456,11 +1456,19 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, ret = -ENOSPC; break; } - if (dn->data_blkaddr != NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn->data_blkaddr); - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + + if (dn->data_blkaddr == NEW_ADDR) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + break; } + + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); } f2fs_update_extent_cache_range(dn, start, 0, index - start); From 17dd34dd547b34a4463dbdcf9b207dc2fd127493 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 May 2022 14:09:22 +0800 Subject: [PATCH 415/580] f2fs: fix deadloop in foreground GC As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215914 The root cause is: in a very small sized image, it's very easy to exceed the threshold of foreground GC; if we calculate free space and dirty data based on section granularity, in a corner case has_not_enough_free_secs() will always return true, resulting in a deadloop in f2fs_gc(). So this patch refactors has_not_enough_free_secs() as below to fix this issue: 1. calculate needed space based on block granularity, and separate all blocks into two parts, a section part and a block part, comparing the section part to free sections, and comparing the block part to free space in the opened logs. 2. account F2FS_DIRTY_NODES, F2FS_DIRTY_IMETA and F2FS_DIRTY_DENTS as node block consumer; 3.
account F2FS_DIRTY_DENTS as data block consumer; Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index aeb008f2377b..60a9b9d89e23 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -572,11 +572,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } -static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, + unsigned int node_blocks, unsigned int dent_blocks) { - unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + - get_pages(sbi, F2FS_DIRTY_DENTS); - unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; int i; @@ -602,19 +601,28 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS) + + get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); + unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && - has_curseg_enough_space(sbi)) + free = free_sections(sbi) + freed; + need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; + need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + + if (free > need_upper) return false; - return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + needed); + else if (free <= need_lower) + return true; + return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) From 075300663e4b7396640837dd81cf5b6de0e406c6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 May 2022 09:33:06 +0800 Subject: [PATCH 416/580] f2fs: fix to do sanity check on total_data_blocks As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215916 The kernel message is shown below: kernel BUG at fs/f2fs/segment.c:2560! 
Call Trace: allocate_segment_by_default+0x228/0x440 f2fs_allocate_data_block+0x13d1/0x31f0 do_write_page+0x18d/0x710 f2fs_outplace_write_data+0x151/0x250 f2fs_do_write_data_page+0xef9/0x1980 move_data_page+0x6af/0xbc0 do_garbage_collect+0x312f/0x46f0 f2fs_gc+0x6b0/0x3bc0 f2fs_balance_fs+0x921/0x2260 f2fs_write_single_data_page+0x16be/0x2370 f2fs_write_cache_pages+0x428/0xd00 f2fs_write_data_pages+0x96e/0xd50 do_writepages+0x168/0x550 __writeback_single_inode+0x9f/0x870 writeback_sb_inodes+0x47d/0xb20 __writeback_inodes_wb+0xb2/0x200 wb_writeback+0x4bd/0x660 wb_workfn+0x5f3/0xab0 process_one_work+0x79f/0x13e0 worker_thread+0x89/0xf60 kthread+0x26a/0x300 ret_from_fork+0x22/0x30 RIP: 0010:new_curseg+0xe8d/0x15f0 The root cause is: ckpt.valid_block_count is inconsistent with the SIT table; stat info indicates the filesystem has free blocks, but the SIT table indicates the filesystem has no free segment. So during garbage collection, it triggers a panic when the LFS allocator fails to find a free segment. This patch tries to fix this issue by checking consistency between ckpt.valid_block_count and the blocks accounted from the SIT. Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/segment.c | 33 ++++++++++++++++++++----------- fs/f2fs/segment.h | 1 + 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 42c2e54326b5..6b21230a3773 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1116,8 +1116,8 @@ enum count_type { */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { - DATA, - NODE, + DATA = 0, + NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4bfb6ff44570..ff0f082dd368 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4463,7 +4463,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; - block_t total_node_blocks = 0; + block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, @@ -4488,8 +4488,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { /* build discard map only one time */ @@ -4529,15 +4529,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; - if (IS_NODESEG(se->type)) - total_node_blocks -= old_valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { @@ -4559,13 +4559,24 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } up_read(&curseg->journal_rwsem); - if (!err && total_node_blocks != valid_node_count(sbi)) { + if (err) + return err; + + if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", - total_node_blocks, valid_node_count(sbi)); - err = -EFSCORRUPTED; + sit_valid_blocks[NODE], valid_node_count(sbi)); + return -EFSCORRUPTED; } - return err; + if
(sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > + valid_user_blocks(sbi)) { + f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", + sit_valid_blocks[DATA], sit_valid_blocks[NODE], + valid_user_blocks(sbi)); + return -EFSCORRUPTED; + } + + return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 60a9b9d89e23..d253bdbaf6c4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -24,6 +24,7 @@ #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) +#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA)) static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, unsigned short seg_type) From f33cec797f9c1bea2884df8e7fee238c3cf42e8d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 May 2022 18:30:31 +0800 Subject: [PATCH 417/580] f2fs: give priority to select unpinned section for foreground GC Previously, during foreground GC, if victims contain data of pinned file, it will fail migration of the data, and meanwhile i_gc_failures of that pinned file may increase, and when it exceeds threshold, GC will unpin the file, result in breaking pinfile's semantics. In order to mitigate such condition, let's record and skip section which has pinned file's data and give priority to select unpinned one. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 85 +++++++++++++++++++++++++++++++++++++++-------- fs/f2fs/segment.c | 8 +++++ fs/f2fs/segment.h | 3 ++ 3 files changed, 82 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 97efa11beb27..4d1a4df09634 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -646,6 +646,54 @@ static void release_victim_entry(struct f2fs_sb_info *sbi) f2fs_bug_on(sbi, !list_empty(&am->victim_list)); } +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type != FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. 
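Taken together, the helpers added above implement a three-step policy: remember the section when migration trips over pinned-file data, skip remembered sections while the foreground GC pass still has other candidates, and clear the map and retry once nothing else is left (the retry path appears in the f2fs_gc() hunk further below). The next few lines are a small standalone model of that policy; the fixed-size arrays and the linear victim search are simplified stand-ins, not the f2fs bitmaps and victim-selection code.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NSECS 8

struct gc_model {
    bool dirty[NSECS];    /* sections that have something to clean      */
    bool pinned[NSECS];   /* sections where pinned-file data blocked GC */
    bool enable_pin;      /* mirrors dirty_i->enable_pin_section        */
};

/* mirrors f2fs_pin_section(): record the section instead of unpinning the file */
static void pin_section(struct gc_model *m, int sec)
{
    if (m->enable_pin)
        m->pinned[sec] = true;
}

/* mirrors the FG_GC victim search: prefer dirty sections that are not pinned */
static int pick_victim(const struct gc_model *m)
{
    for (int i = 0; i < NSECS; i++)
        if (m->dirty[i] && !(m->enable_pin && m->pinned[i]))
            return i;
    return -1;   /* "no victim", like -ENODATA in the real code */
}

int main(void)
{
    struct gc_model m = { .enable_pin = true };

    m.dirty[2] = m.dirty[5] = true;
    pin_section(&m, 2);                            /* section 2 holds pinned-file data */
    printf("first pick: %d\n", pick_victim(&m));   /* 5: the unpinned section wins */

    m.dirty[5] = false;                            /* only the pinned section is left */
    if (pick_victim(&m) < 0) {
        memset(m.pinned, 0, sizeof(m.pinned));     /* like f2fs_unpin_all_sections() */
        printf("retry pick: %d\n", pick_victim(&m)); /* 2: allowed as a last resort */
    }
    return 0;
}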
@@ -787,6 +835,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + if (is_atgc) { add_victim_entry(sbi, &p, segno); goto next; @@ -1201,12 +1252,9 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); @@ -1351,12 +1399,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, err = -EAGAIN; goto out; } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } if (gc_type == BG_GC) { if (PageWriteback(page)) { @@ -1477,14 +1522,15 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + int err; + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode) || special_file(inode->i_mode)) continue; - if (is_inode_flag_set(inode, FI_PIN_FILE) && - gc_type == FG_GC) { - f2fs_pin_file_control(inode, true); + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { iput(inode); return submitted; } @@ -1767,9 +1813,17 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, ret = -EINVAL; goto stop; } +retry: ret = __get_victim(sbi, &segno, gc_type); - if (ret) + if (ret) { + /* allow to search victim from sections has pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } goto stop; + } seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); if (gc_type == FG_GC && @@ -1820,6 +1874,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ff0f082dd368..ee8d91151112 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4656,6 +4656,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; + + dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->pinned_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap_cnt = 0; + dirty_i->enable_pin_section = true; return 0; } @@ -4889,6 +4896,7 @@ static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d253bdbaf6c4..de1d222243cf 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -295,6 +295,9 @@ struct dirty_seglist_info { struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ + unsigned long *pinned_secmap; /* 
pinned victims from foreground GC */ + unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ + bool enable_pin_section; /* enable pinning section */ }; /* victim selection function for cleaning and SSR */ From 4caaf08c4b3223e63d581709eac7962323fcb860 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Sat, 7 May 2022 00:28:14 +0800 Subject: [PATCH 418/580] f2fs: skip GC if possible when checkpoint disabling If the number of unusable blocks is not larger than unusable capacity, we can skip GC when checkpoint disabling. Signed-off-by: Weichao Guo Signed-off-by: Chao Yu [Jaegeuk Kim: Fix missing gc_mode assignment] Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6451d2ea89cd..bb2a05754aef 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2049,7 +2049,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - unsigned int gc_mode; + unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; block_t unusable; @@ -2060,9 +2060,13 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } sbi->sb->s_flags |= SB_ACTIVE; + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + f2fs_update_time(sbi, DISABLE_TIME); - gc_mode = sbi->gc_mode; sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { @@ -2088,6 +2092,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } +skip_gc: f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); From 6817fef5404697ea03f6a4bb312814833843f188 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 14:17:32 -0700 Subject: [PATCH 419/580] f2fs: stop allocating pinned sections if EAGAIN happens EAGAIN doesn't guarantee to have a free section. Let's report it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ac21254b2f0c..de16e46a44f7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1702,7 +1702,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + if (err && err != -ENODATA) goto out_err; } From 4f77ff4eab6070d955895da443546a468db37a6d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 5 May 2022 17:40:25 -0700 Subject: [PATCH 420/580] f2fs: don't need inode lock for system hidden quota Let's avoid false-alarmed lockdep warning. 
[ 58.914674] [T1501146] -> #2 (&sb->s_type->i_mutex_key#20){+.+.}-{3:3}: [ 58.915975] [T1501146] system_server: down_write+0x7c/0xe0 [ 58.916738] [T1501146] system_server: f2fs_quota_sync+0x60/0x1a8 [ 58.917563] [T1501146] system_server: block_operations+0x16c/0x43c [ 58.918410] [T1501146] system_server: f2fs_write_checkpoint+0x114/0x318 [ 58.919312] [T1501146] system_server: f2fs_issue_checkpoint+0x178/0x21c [ 58.920214] [T1501146] system_server: f2fs_sync_fs+0x48/0x6c [ 58.920999] [T1501146] system_server: f2fs_do_sync_file+0x334/0x738 [ 58.921862] [T1501146] system_server: f2fs_sync_file+0x30/0x48 [ 58.922667] [T1501146] system_server: __arm64_sys_fsync+0x84/0xf8 [ 58.923506] [T1501146] system_server: el0_svc_common.llvm.12821150825140585682+0xd8/0x20c [ 58.924604] [T1501146] system_server: do_el0_svc+0x28/0xa0 [ 58.925366] [T1501146] system_server: el0_svc+0x24/0x38 [ 58.926094] [T1501146] system_server: el0_sync_handler+0x88/0xec [ 58.926920] [T1501146] system_server: el0_sync+0x1b4/0x1c0 [ 58.927681] [T1501146] -> #1 (&sbi->cp_global_sem){+.+.}-{3:3}: [ 58.928889] [T1501146] system_server: down_write+0x7c/0xe0 [ 58.929650] [T1501146] system_server: f2fs_write_checkpoint+0xbc/0x318 [ 58.930541] [T1501146] system_server: f2fs_issue_checkpoint+0x178/0x21c [ 58.931443] [T1501146] system_server: f2fs_sync_fs+0x48/0x6c [ 58.932226] [T1501146] system_server: sync_filesystem+0xac/0x130 [ 58.933053] [T1501146] system_server: generic_shutdown_super+0x38/0x150 [ 58.933958] [T1501146] system_server: kill_block_super+0x24/0x58 [ 58.934791] [T1501146] system_server: kill_f2fs_super+0xcc/0x124 [ 58.935618] [T1501146] system_server: deactivate_locked_super+0x90/0x120 [ 58.936529] [T1501146] system_server: deactivate_super+0x74/0xac [ 58.937356] [T1501146] system_server: cleanup_mnt+0x128/0x168 [ 58.938150] [T1501146] system_server: __cleanup_mnt+0x18/0x28 [ 58.938944] [T1501146] system_server: task_work_run+0xb8/0x14c [ 58.939749] [T1501146] system_server: do_notify_resume+0x114/0x1e8 [ 58.940595] [T1501146] system_server: work_pending+0xc/0x5f0 [ 58.941375] [T1501146] -> #0 (&sbi->gc_lock){+.+.}-{3:3}: [ 58.942519] [T1501146] system_server: __lock_acquire+0x1270/0x2868 [ 58.943366] [T1501146] system_server: lock_acquire+0x114/0x294 [ 58.944169] [T1501146] system_server: down_write+0x7c/0xe0 [ 58.944930] [T1501146] system_server: f2fs_issue_checkpoint+0x13c/0x21c [ 58.945831] [T1501146] system_server: f2fs_sync_fs+0x48/0x6c [ 58.946614] [T1501146] system_server: f2fs_do_sync_file+0x334/0x738 [ 58.947472] [T1501146] system_server: f2fs_ioc_commit_atomic_write+0xc8/0x14c [ 58.948439] [T1501146] system_server: __f2fs_ioctl+0x674/0x154c [ 58.949253] [T1501146] system_server: f2fs_ioctl+0x54/0x88 [ 58.950018] [T1501146] system_server: __arm64_sys_ioctl+0xa8/0x110 [ 58.950865] [T1501146] system_server: el0_svc_common.llvm.12821150825140585682+0xd8/0x20c [ 58.951965] [T1501146] system_server: do_el0_svc+0x28/0xa0 [ 58.952727] [T1501146] system_server: el0_svc+0x24/0x38 [ 58.953454] [T1501146] system_server: el0_sync_handler+0x88/0xec [ 58.954279] [T1501146] system_server: el0_sync+0x1b4/0x1c0 Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bb2a05754aef..b78fa15d948f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2684,7 +2684,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (!sb_has_quota_active(sb, cnt)) continue; - inode_lock(dqopt->files[cnt]); + 
if (!f2fs_sb_has_quota_ino(sbi)) + inode_lock(dqopt->files[cnt]); /* * do_quotactl @@ -2703,7 +2704,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); - inode_unlock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_unlock(dqopt->files[cnt]); if (ret) break; From fa53d81c586f21d9353cbc512185290941bf30a1 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 28 Apr 2022 11:18:09 -0700 Subject: [PATCH 421/580] f2fs: change the current atomic write way Current atomic write has three major issues like below. - keeps the updates in non-reclaimable memory space and they are even hard to be migrated, which is not good for contiguous memory allocation. - disk spaces used for atomic files cannot be garbage collected, so this makes it difficult for the filesystem to be defragmented. - If atomic write operations hit the threshold of either memory usage or garbage collection failure count, All the atomic write operations will fail immediately. To resolve the issues, I will keep a COW inode internally for all the updates to be flushed from memory, when we need to flush them out in a situation like high memory pressure. These COW inodes will be tagged as orphan inodes to be reclaimed in case of sudden power-cut or system failure during atomic writes. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 180 +++++++++------ fs/f2fs/debug.c | 12 +- fs/f2fs/f2fs.h | 35 +-- fs/f2fs/file.c | 49 +++-- fs/f2fs/gc.c | 27 +-- fs/f2fs/inode.c | 3 +- fs/f2fs/namei.c | 30 ++- fs/f2fs/node.c | 4 - fs/f2fs/node.h | 1 - fs/f2fs/segment.c | 424 +++++++++++++----------------------- fs/f2fs/segment.h | 4 +- fs/f2fs/super.c | 6 +- include/trace/events/f2fs.h | 22 -- 13 files changed, 326 insertions(+), 471 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d18299185fee..ed18093a1ea0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -67,8 +67,7 @@ static bool __is_cp_guaranteed(struct page *page) if (f2fs_is_compressed_page(page)) return false; - if ((S_ISREG(inode->i_mode) && - (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; @@ -2619,7 +2618,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) bool ipu_force = false; int err = 0; - set_new_dnode(&dn, inode, NULL, NULL, 0); + /* Use COW inode to make dnode_of_data for atomic write */ + if (f2fs_is_atomic_file(inode)) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; @@ -2656,6 +2660,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) err = -EFSCORRUPTED; goto out_writepage; } + /* * If current allocation needs SSR, * it had better in-place writes for updated data. 
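The heart of the new write path added in the next hunk is the lookup order set up by prepare_atomic_write_begin() via __find_data_block() and __reserve_data_block(): a page being written first consults the COW inode's mapping, falls back to the original inode's mapping for existing data, and reserves a fresh COW block when neither has one, so old data can still be read while new data lands in the COW file. Below is a standalone model of that order using plain arrays as stand-in block maps; it is illustrative only, not the f2fs implementation.

#include <stdio.h>

#define NULL_ADDR 0u        /* stand-in: "no block mapped at this index" */
#define NBLKS     4

static unsigned int next_free = 1000;   /* stand-in block allocator */

/* Ensure a COW slot exists for idx; return the address the caller should
 * read existing data from (COW copy if present, else the original block,
 * else the freshly reserved COW block for brand-new data). */
static unsigned int resolve_write_block(unsigned int cow[],
                                        const unsigned int orig[], int idx)
{
    if (cow[idx] != NULL_ADDR)      /* already copied: keep writing the copy */
        return cow[idx];

    cow[idx] = next_free++;         /* reserve a new block in the COW file */

    if (orig[idx] != NULL_ADDR)     /* old data exists: read it from there */
        return orig[idx];

    return cow[idx];                /* nothing old to read, use the new block */
}

int main(void)
{
    unsigned int orig[NBLKS] = { 11, 12, NULL_ADDR, NULL_ADDR };
    unsigned int cow[NBLKS]  = { NULL_ADDR, 500, NULL_ADDR, NULL_ADDR };

    for (int i = 0; i < NBLKS; i++) {
        unsigned int read_from = resolve_write_block(cow, orig, i);

        printf("index %d -> read from %u, cow slot %u\n", i, read_from, cow[i]);
    }
    return 0;
}

The design choice this models is that the original file is never written in place during an atomic write; it is only read, and all updates are staged in the COW inode until commit.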
@@ -3371,6 +3376,100 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, return err; } +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct page *ipage; + struct extent_info ei = {0, }; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; + f2fs_put_dnode(&dn); + return err; +} + +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage; + int err = 0; + + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + set_new_dnode(&dn, inode, ipage, ipage, 0); + + err = f2fs_get_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + return err; +} + +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = page->index; + int err = 0; + block_t ori_blk_addr; + + /* If pos is beyond the end of file, reserve a new block in COW inode */ + if ((pos & PAGE_MASK) >= i_size_read(inode)) + return __reserve_data_block(cow_inode, index, blk_addr, + node_changed); + + /* Look for the block in COW inode first */ + err = __find_data_block(cow_inode, index, blk_addr); + if (err) + return err; + else if (*blk_addr != NULL_ADDR) + return 0; + + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -3379,7 +3478,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false, drop_atomic = false; + bool need_balance = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -3390,14 +3489,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } - if ((f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) || - is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - err = -ENOMEM; - drop_atomic = true; - goto fail; - } - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. 
The locking rule for inline_data conversion should be: @@ -3445,7 +3536,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; - err = prepare_write_begin(sbi, page, pos, len, + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + else + err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); if (err) goto fail; @@ -3501,8 +3596,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, fail: f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); - if (drop_atomic) - f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -3546,8 +3639,12 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); if (pos + copied > i_size_read(inode) && - !f2fs_verity_in_progress(inode)) + !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -3724,9 +3821,6 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, inode->i_ino == F2FS_COMPRESS_INO(sbi)) clear_page_private_data(page); - if (page_private_atomic(page)) - return f2fs_drop_inmem_page(inode, page); - detach_page_private(page); set_page_private(page, 0); } @@ -3737,10 +3831,6 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (PageDirty(page)) return 0; - /* This is atomic written page, keep Private */ - if (page_private_atomic(page)) - return 0; - if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { struct inode *inode = page->mapping->host; @@ -3766,18 +3856,6 @@ static int f2fs_set_data_page_dirty(struct page *page) if (PageSwapCache(page)) return __set_page_dirty_nobuffers(page); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!page_private_atomic(page)) { - f2fs_register_inmem_page(inode, page); - return 1; - } - /* - * Previously, this page has been registered, we just - * return here. - */ - return 0; - } - if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); f2fs_update_dirty_page(inode, page); @@ -3857,42 +3935,14 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { - int rc, extra_count; - struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = page_private_atomic(page); + int rc, extra_count = 0; BUG_ON(PageWriteback(page)); - /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written) { - if (mode != MIGRATE_SYNC) - return -EBUSY; - if (!mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; - } - - /* one extra reference was held for atomic_write page */ - extra_count = atomic_written ? 
1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) { - if (atomic_written) - mutex_unlock(&fi->inmem_lock); + if (rc != MIGRATEPAGE_SUCCESS) return rc; - } - - if (atomic_written) { - struct inmem_pages *cur; - - list_for_each_entry(cur, &fi->inmem_pages, list) - if (cur->page == page) { - cur->page = newpage; - break; - } - mutex_unlock(&fi->inmem_lock); - put_page(page); - get_page(newpage); - } /* guarantee to start from no stale private field */ set_page_private(newpage, 0); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fcdf253cd211..65f0bcf498bb 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,7 +91,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = sbi->atomic_files; si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); @@ -167,8 +166,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; - si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; - si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -296,7 +293,6 @@ static void update_mem_info(struct f2fs_sb_info *sbi) sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * sizeof(struct nat_entry_set); - si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * @@ -491,10 +487,6 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "Skipped : atomic write %llu (%llu)\n", - si->skipped_atomic_files[BG_GC] + - si->skipped_atomic_files[FG_GC], - si->skipped_atomic_files[BG_GC]); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); seq_puts(s, "\nExtent Cache:\n"); @@ -519,9 +511,9 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + seq_printf(s, " - atomic IO: %4d (Max. %4d), " "volatile IO: %4d (Max. 
%4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->aw_cnt, si->max_aw_cnt, si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6b21230a3773..72933c52f977 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -713,7 +713,6 @@ enum { enum { GC_FAILURE_PIN, - GC_FAILURE_ATOMIC, MAX_GC_FAILURE }; @@ -735,7 +734,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -749,7 +747,6 @@ enum { FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ @@ -791,12 +788,10 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ - struct list_head inmem_ilist; /* list for inmem inodes */ - struct list_head inmem_pages; /* inmemory pages managed by f2fs */ - struct task_struct *inmem_task; /* store inmemory task */ - struct mutex inmem_lock; /* lock for inmemory pages */ - pgoff_t ra_offset; /* ongoing readahead offset */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree; /* cached extent_tree entry */ + pgoff_t ra_offset; /* ongoing readahead offset */ + struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ struct f2fs_rwsem i_gc_rwsem[2]; @@ -1091,7 +1086,6 @@ enum count_type { F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, - F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, @@ -1121,11 +1115,7 @@ enum page_type { META, NR_PAGE_TYPE, META_FLUSH, - INMEM, /* the below types are used by tracepoints only. */ - INMEM_DROP, - INMEM_INVALIDATE, - INMEM_REVOKE, - IPU, + IPU, /* the below types are used by tracepoints only. 
*/ OPU, }; @@ -1722,7 +1712,6 @@ struct f2fs_sb_info { /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ - unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -3212,11 +3201,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_commit_atomic_write(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); -} - static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); @@ -3487,6 +3471,8 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); +int f2fs_get_tmpfile(struct inode *dir, + struct inode **new_inode); /* * dir.c @@ -3620,11 +3606,8 @@ void f2fs_destroy_node_manager_caches(void); * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); -void f2fs_register_inmem_page(struct inode *inode, struct page *page); -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void f2fs_drop_inmem_pages(struct inode *inode); -void f2fs_drop_inmem_page(struct inode *inode, struct page *page); -int f2fs_commit_inmem_pages(struct inode *inode); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); @@ -3856,7 +3839,6 @@ struct f2fs_stat_info { int ext_tree, zombie_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - int inmem_pages; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; @@ -3886,7 +3868,6 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; - unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index de16e46a44f7..6648e124a59f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1830,9 +1830,8 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) atomic_read(&inode->i_writecount) != 1) return 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); if (f2fs_is_volatile_file(inode)) { set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); @@ -1854,8 +1853,8 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * before dropping file lock, it needs to do in ->flush. 
*/ if (f2fs_is_atomic_file(inode) && - F2FS_I(inode)->inmem_task == current) - f2fs_drop_inmem_pages(inode); + F2FS_I(inode)->atomic_write_task == current) + f2fs_abort_atomic_write(inode, true); return 0; } @@ -2075,6 +2074,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode *pinode; int ret; if (!inode_owner_or_capable(inode)) @@ -2097,11 +2097,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } - if (f2fs_is_atomic_file(inode)) { - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) - ret = -EINVAL; + if (f2fs_is_atomic_file(inode)) goto out; - } ret = f2fs_convert_inline_inode(inode); if (ret) @@ -2122,19 +2119,33 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } + + ret = f2fs_get_tmpfile(pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + goto out; + } + f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(&fi->inmem_ilist)) - list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); sbi->atomic_files++; spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->inmem_task = current; + F2FS_I(inode)->atomic_write_task = current; stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2164,21 +2175,17 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } if (f2fs_is_atomic_file(inode)) { - ret = f2fs_commit_inmem_pages(inode); + ret = f2fs_commit_atomic_write(inode); if (ret) goto err_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, false); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - ret = -EINVAL; - } inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2263,15 +2270,13 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) inode_lock(inode); if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - inode_unlock(inode); mnt_drop_write_file(filp); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4d1a4df09634..1ea3fb916bec 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1245,13 +1245,6 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } - err = f2fs_gc_pinned_control(inode, gc_type, segno); if 
(err) goto out; @@ -1393,12 +1386,6 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } err = f2fs_gc_pinned_control(inode, gc_type, segno); if (err) goto out; @@ -1766,8 +1753,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; - unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, background, @@ -1781,7 +1766,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); sbi->skipped_gc_rwsem = 0; - first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1832,10 +1816,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped || - sbi->skipped_gc_rwsem) + if (sbi->skipped_gc_rwsem) skipped_round++; - last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; } @@ -1861,13 +1843,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, segno = NULL_SEGNO; goto gc_more; } - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) ret = f2fs_write_checkpoint(sbi, &cpc); stop: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7d8d319fc406..53fde010d513 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -745,9 +745,8 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index edfda2740a93..654c85443d91 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -833,8 +833,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, return err; } -static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static int __f2fs_tmpfile(struct inode *dir, + struct dentry *dentry, umode_t mode, bool is_whiteout, + struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -848,7 +849,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return PTR_ERR(inode); - if (whiteout) { + if (is_whiteout) { init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); inode->i_op = &f2fs_special_inode_operations; } else { @@ -873,21 +874,25 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, f2fs_add_orphan_inode(inode); f2fs_alloc_nid_done(sbi, inode->i_ino); - if (whiteout) { + if (is_whiteout) { f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); - - *whiteout = inode; } else { - d_tmpfile(dentry, inode); + if (dentry) + d_tmpfile(dentry, inode); + else + f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well. 
*/ f2fs_unlock_op(sbi); unlock_new_inode(inode); + if (new_inode) + *new_inode = inode; + f2fs_balance_fs(sbi, true); return 0; @@ -907,7 +912,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(dir, dentry, mode, NULL); + return __f2fs_tmpfile(dir, dentry, mode, false, NULL); } static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) @@ -915,7 +920,14 @@ static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) return -EIO; - return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); + return __f2fs_tmpfile(dir, NULL, + S_IFCHR | WHITEOUT_MODE, true, whiteout); +} + +int f2fs_get_tmpfile(struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(dir, NULL, S_IFREG, false, new_inode); } static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b8b055580420..b06ccf6fdd5c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -90,10 +90,6 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == INMEM_PAGES) { - /* it allows 20% / total_ram for inmemory pages */ - mem_size = get_pages(sbi, F2FS_INMEM_PAGES); - res = mem_size < (val.totalram / 5); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 4c1d34bfea78..3c09cae058b0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,7 +147,6 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ - INMEM_PAGES, /* indicates inmemory pages */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ee8d91151112..8366d6a28c37 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -29,7 +29,7 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; -static struct kmem_cache *inmem_entry_slab; +static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { @@ -184,308 +184,180 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void f2fs_register_inmem_page(struct inode *inode, struct page *page) -{ - struct inmem_pages *new; - - set_page_private_atomic(page); - - new = f2fs_kmem_cache_alloc(inmem_entry_slab, - GFP_NOFS, true, NULL); - - /* add atomic page indices to the list */ - new->page = page; - INIT_LIST_HEAD(&new->list); - - /* increase reference count with clean state */ - get_page(page); - mutex_lock(&F2FS_I(inode)->inmem_lock); - list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages); - inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); - mutex_unlock(&F2FS_I(inode)->inmem_lock); - - trace_f2fs_register_inmem_page(page, INMEM); -} - -static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) +void f2fs_abort_atomic_write(struct inode *inode, bool clean) { struct f2fs_sb_info 
*sbi = F2FS_I_SB(inode); - struct inmem_pages *cur, *tmp; - int err = 0; + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (f2fs_is_atomic_file(inode)) { + if (clean) + truncate_inode_pages_final(inode->i_mapping); + clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + clear_inode_flag(inode, FI_ATOMIC_FILE); + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + sbi->atomic_files--; + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } +} + +static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, + block_t new_addr, block_t *old_addr, bool recover) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct node_info ni; + int err; + +retry: + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; + } + return err; + } + + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); + } + } else { + blkcnt_t count = 1; + + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); + inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); + } + + f2fs_put_dnode(&dn); + return 0; +} + +static void __complete_revoke_list(struct inode *inode, struct list_head *head, + bool revoke) +{ + struct revoke_entry *cur, *tmp; list_for_each_entry_safe(cur, tmp, head, list) { - struct page *page = cur->page; - - if (drop) - trace_f2fs_commit_inmem_page(page, INMEM_DROP); - - if (trylock) { - /* - * to avoid deadlock in between page lock and - * inmem_lock. 
- */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - if (recover) { - struct dnode_of_data dn; - struct node_info ni; - - trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); -retry: - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, page->index, - LOOKUP_NODE); - if (err) { - if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); - goto retry; - } - err = -EAGAIN; - goto next; - } - - err = f2fs_get_node_info(sbi, dn.nid, &ni, false); - if (err) { - f2fs_put_dnode(&dn); - return err; - } - - if (cur->old_addr == NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } else - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, - cur->old_addr, ni.version, true, true); - f2fs_put_dnode(&dn); - } -next: - /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) { - ClearPageUptodate(page); - clear_page_private_gcing(page); - } - detach_page_private(page); - set_page_private(page, 0); - f2fs_put_page(page, 1); - + if (revoke) + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + kmem_cache_free(revoke_entry_slab, cur); } - return err; } -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) -{ - struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; - struct inode *inode; - struct f2fs_inode_info *fi; - unsigned int count = sbi->atomic_files; - unsigned int looped = 0; -next: - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(head)) { - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - return; - } - fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); - inode = igrab(&fi->vfs_inode); - if (inode) - list_move_tail(&fi->inmem_ilist, head); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - if (inode) { - if (gc_failure) { - if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto skip; - } - set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_drop_inmem_pages(inode); -skip: - iput(inode); - } - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - if (gc_failure) { - if (++looped >= count) - return; - } - goto next; -} - -void f2fs_drop_inmem_pages(struct inode *inode) +static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - - do { - mutex_lock(&fi->inmem_lock); - if (list_empty(&fi->inmem_pages)) { - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - mutex_unlock(&fi->inmem_lock); - break; - } - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - mutex_unlock(&fi->inmem_lock); - } while (1); -} - -void f2fs_drop_inmem_page(struct inode *inode, struct page *page) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct list_head *head = &fi->inmem_pages; - struct inmem_pages *cur = NULL; - struct inmem_pages *tmp; - - f2fs_bug_on(sbi, !page_private_atomic(page)); - - mutex_lock(&fi->inmem_lock); - list_for_each_entry(tmp, head, list) { - if (tmp->page == page) { - cur = tmp; - 
break; - } - } - - f2fs_bug_on(sbi, !cur); - list_del(&cur->list); - mutex_unlock(&fi->inmem_lock); - - dec_page_count(sbi, F2FS_INMEM_PAGES); - kmem_cache_free(inmem_entry_slab, cur); - - ClearPageUptodate(page); - clear_page_private_atomic(page); - f2fs_put_page(page, 0); - - detach_page_private(page); - set_page_private(page, 0); - - trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); -} - -static int __f2fs_commit_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - struct inmem_pages *cur, *tmp; - struct f2fs_io_info fio = { - .sbi = sbi, - .ino = inode->i_ino, - .type = DATA, - .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_PRIO, - .io_type = FS_DATA_IO, - }; + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; struct list_head revoke_list; - bool submit_bio = false; - int err = 0; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; INIT_LIST_HEAD(&revoke_list); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - struct page *page = cur->page; + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); - lock_page(page); - if (page->mapping == inode->i_mapping) { - trace_f2fs_commit_inmem_page(page, INMEM); - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { - inode_dec_dirty_pages(inode); - f2fs_remove_dirty_inode(inode); - } -retry: - fio.page = page; - fio.old_blkaddr = NULL_ADDR; - fio.encrypted_page = NULL; - fio.need_lock = LOCK_DONE; - err = f2fs_do_write_data_page(&fio); - if (err) { - if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); - goto retry; - } - unlock_page(page); - break; - } - /* record old blkaddr for revoking */ - cur->old_addr = fio.old_blkaddr; - submit_bio = true; + set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; } - unlock_page(page); - list_move_tail(&cur->list, &revoke_list); + + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + goto out; + } + + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); + if (!new) { + f2fs_put_dnode(&dn); + ret = -ENOMEM; + goto out; + } + + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; + } + + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); + } + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; } - if (submit_bio) - f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); +out: + __complete_revoke_list(inode, &revoke_list, ret ? 
true : false); - if (err) { - /* - * try to revoke all committed pages, but still we could fail - * due to no memory or other reason, if that happened, EAGAIN - * will be returned, which means in such case, transaction is - * already not integrity, caller should use journal to do the - * recovery or rewrite & commit last transaction. For other - * error number, revoking was done by filesystem itself. - */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); - - /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); - } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); - } - - return err; + return ret; } -int f2fs_commit_inmem_pages(struct inode *inode) +int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - f2fs_lock_op(sbi); - set_inode_flag(inode, FI_ATOMIC_COMMIT); - mutex_lock(&fi->inmem_lock); - err = __f2fs_commit_inmem_pages(inode); - mutex_unlock(&fi->inmem_lock); - - clear_inode_flag(inode, FI_ATOMIC_COMMIT); + err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); @@ -5007,9 +4879,9 @@ int __init f2fs_create_segment_manager_caches(void) if (!sit_entry_set_slab) goto destroy_discard_cmd; - inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry", - sizeof(struct inmem_pages)); - if (!inmem_entry_slab) + revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", + sizeof(struct revoke_entry)); + if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; @@ -5028,5 +4900,5 @@ void f2fs_destroy_segment_manager_caches(void) kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); - kmem_cache_destroy(inmem_entry_slab); + kmem_cache_destroy(revoke_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index de1d222243cf..00d6bc556a6b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -225,10 +225,10 @@ struct segment_allocation { #define MAX_SKIP_GC_COUNT 16 -struct inmem_pages { +struct revoke_entry { struct list_head list; - struct page *page; block_t old_addr; /* for revoking when fail to commit */ + pgoff_t index; }; struct sit_info { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b78fa15d948f..f6ab492af689 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1324,9 +1324,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); - INIT_LIST_HEAD(&fi->inmem_ilist); - INIT_LIST_HEAD(&fi->inmem_pages); - mutex_init(&fi->inmem_lock); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_mmap_sem); @@ -1370,9 +1367,8 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1bdf7c993f7b..ed2d4f373ac3 100644 --- a/include/trace/events/f2fs.h +++ 
b/include/trace/events/f2fs.h @@ -15,10 +15,6 @@ TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); -TRACE_DEFINE_ENUM(INMEM); -TRACE_DEFINE_ENUM(INMEM_DROP); -TRACE_DEFINE_ENUM(INMEM_INVALIDATE); -TRACE_DEFINE_ENUM(INMEM_REVOKE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(HOT); @@ -59,10 +55,6 @@ TRACE_DEFINE_ENUM(CP_RESIZE); { DATA, "DATA" }, \ { META, "META" }, \ { META_FLUSH, "META_FLUSH" }, \ - { INMEM, "INMEM" }, \ - { INMEM_DROP, "INMEM_DROP" }, \ - { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ - { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -1296,20 +1288,6 @@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite, TP_ARGS(page, type) ); -DEFINE_EVENT(f2fs__page, f2fs_register_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, type) -); - -DEFINE_EVENT(f2fs__page, f2fs_commit_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, type) -); - TRACE_EVENT(f2fs_filemap_fault, TP_PROTO(struct inode *inode, pgoff_t index, unsigned long ret), From 552d84cbaa81b6d5e10f01b3a878d13e7f359538 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 5 May 2022 17:49:18 -0700 Subject: [PATCH 422/580] f2fs: kill volatile write support There's no user, since all can use atomic writes simply. Let's kill it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +- fs/f2fs/data.c | 5 -- fs/f2fs/debug.c | 10 +--- fs/f2fs/f2fs.h | 27 +---------- fs/f2fs/file.c | 113 ++----------------------------------------- fs/f2fs/segment.c | 3 +- fs/f2fs/verity.c | 2 +- 7 files changed, 10 insertions(+), 154 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9571945c056f..f837072d1596 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1003,9 +1003,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - if (!f2fs_is_volatile_file(inode)) - list_add_tail(&F2FS_I(inode)->dirty_list, - &sbi->inode_list[type]); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ed18093a1ea0..aabb35a800a0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2797,11 +2797,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, write: if (f2fs_is_drop_cache(inode)) goto out; - /* we should not write 0'th page having journal header */ - if (f2fs_is_volatile_file(inode) && (!page->index || - (!wbc->for_reclaim && - f2fs_available_free_memory(sbi, BASE_CHECK)))) - goto redirty_out; /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 65f0bcf498bb..c92625ef16d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -92,9 +92,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->aw_cnt = sbi->atomic_files; - si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); - si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); @@ -511,10 +509,8 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - 
atomic IO: %4d (Max. %4d), " - "volatile IO: %4d (Max. %4d)\n", - si->aw_cnt, si->max_aw_cnt, - si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); @@ -615,9 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); - atomic_set(&sbi->max_vw_cnt, 0); raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 72933c52f977..fc9ba40aac7a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -734,7 +734,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ @@ -1742,9 +1741,7 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ - atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ - atomic_t max_vw_cnt; /* max # of volatile writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -3201,11 +3198,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_volatile_file(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_VOLATILE_FILE); -} - static inline bool f2fs_is_first_block_written(struct inode *inode) { return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); @@ -3856,7 +3848,7 @@ struct f2fs_stat_info { int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; - int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -3967,17 +3959,6 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_volatile_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_dec_volatile_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_update_max_volatile_write(inode) \ - do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ - int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ - if (cur > max) \ - atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ - } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -4039,9 +4020,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) -#define stat_inc_volatile_write(inode) do { } while (0) -#define 
stat_dec_volatile_write(inode) do { } while (0) -#define stat_update_max_volatile_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) @@ -4457,8 +4435,7 @@ static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode) static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6648e124a59f..4276ba10d1de 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1832,13 +1832,6 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) if (f2fs_is_atomic_file(inode)) f2fs_abort_atomic_write(inode, true); - if (f2fs_is_volatile_file(inode)) { - set_inode_flag(inode, FI_DROP_CACHE); - filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(inode, FI_DROP_CACHE); - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - } return 0; } @@ -2169,15 +2162,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - if (f2fs_is_volatile_file(inode)) { - ret = -EINVAL; - goto err_out; - } - if (f2fs_is_atomic_file(inode)) { ret = f2fs_commit_atomic_write(inode); if (ret) - goto err_out; + goto unlock_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) @@ -2185,105 +2173,12 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -err_out: +unlock_out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; } -static int f2fs_ioc_start_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - int ret; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_volatile_file(inode)) - goto out; - - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out; - - stat_inc_volatile_write(inode); - stat_update_max_volatile_write(inode); - - set_inode_flag(inode, FI_VOLATILE_FILE); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_release_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - int ret; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (!f2fs_is_volatile_file(inode)) - goto out; - - if (!f2fs_is_first_block_written(inode)) { - ret = truncate_partial_data_page(inode, 0, true); - goto out; - } - - ret = punch_hole(inode, 0, F2FS_BLKSIZE); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_abort_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - int ret; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_atomic_file(inode)) - f2fs_abort_atomic_write(inode, true); - if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - } - - 
inode_unlock(inode); - - mnt_drop_write_file(filp); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return ret; -} - static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4295,11 +4190,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: - return f2fs_ioc_start_volatile_write(filp); case F2FS_IOC_RELEASE_VOLATILE_WRITE: - return f2fs_ioc_release_volatile_write(filp); case F2FS_IOC_ABORT_VOLATILE_WRITE: - return f2fs_ioc_abort_volatile_write(filp); + return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); case FITRIM: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8366d6a28c37..10719b7603ac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3164,8 +3164,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 074b9f415822..b751f528e5e5 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -128,7 +128,7 @@ static int f2fs_begin_enable_verity(struct file *filp) if (f2fs_verity_in_progress(inode)) return -EBUSY; - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + if (f2fs_is_atomic_file(inode)) return -EOPNOTSUPP; /* From 4bd3626b99bc53a3a355f0cc7e5a6b473e70b413 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 30 Apr 2022 22:08:52 -0700 Subject: [PATCH 423/580] f2fs: reject test_dummy_encryption when !CONFIG_FS_ENCRYPTION There is no good reason to allow this mount option when the kernel isn't configured with encryption support. Since this option is only for testing, we can just fix this; we don't really need to worry about breaking anyone who might be counting on this option being ignored. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f6ab492af689..ebcb4329b7d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -529,10 +529,11 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return -EINVAL; } f2fs_warn(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_warn(sbi, "Test dummy encryption mount option ignored"); -#endif return 0; +#else + f2fs_warn(sbi, "test_dummy_encryption option not supported"); + return -EINVAL; +#endif } #ifdef CONFIG_F2FS_FS_COMPRESSION From bcf03720a39a624af7a154f93d8a5f09b7d0b5a9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 11:40:33 -0700 Subject: [PATCH 424/580] f2fs: introduce f2fs_gc_control to consolidate f2fs_gc parameters No functional change. 
- remove checkpoint=disable check for f2fs_write_checkpoint - get sec_freed all the time Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 +++++- fs/f2fs/file.c | 30 ++++++++++++--- fs/f2fs/gc.c | 74 ++++++++++++++++++++----------------- fs/f2fs/segment.c | 8 +++- fs/f2fs/super.c | 8 +++- include/trace/events/f2fs.h | 18 ++++----- 6 files changed, 98 insertions(+), 51 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc9ba40aac7a..ecde13deff19 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1263,6 +1263,14 @@ struct atgc_management { unsigned long long age_threshold; /* age threshold */ }; +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -3802,8 +3810,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, - unsigned int segno); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4276ba10d1de..7c0eb867e7ef 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1664,6 +1664,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1701,7 +1705,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + err = f2fs_gc(sbi, &gc_control); if (err && err != -ENODATA) goto out_err; } @@ -2421,6 +2425,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false }; __u32 sync; int ret; @@ -2446,7 +2453,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); return ret; @@ -2455,6 +2464,11 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? 
FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync }; u64 end; int ret; @@ -2482,8 +2496,8 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, false, - GET_SEGNO(sbi, range->start)); + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + ret = f2fs_gc(sbi, &gc_control); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -2897,6 +2911,10 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) unsigned int start_segno = 0, end_segno = 0; unsigned int dev_start_segno = 0, dev_end_segno = 0; struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true }; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2940,7 +2958,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, true, start_segno); + + gc_control.victim_segno = start_segno; + ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; else if (ret < 0) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 1ea3fb916bec..9af3190cfdf7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -35,6 +35,9 @@ static int gc_thread_func(void *data) wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false }; wait_ms = gc_th->min_sleep_time; @@ -141,8 +144,12 @@ static int gc_thread_func(void *data) if (foreground) sync_mode = false; + gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.err_gc_skipped = sync_mode; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) + if (f2fs_gc(sbi, &gc_control)) wait_ms = gc_th->no_gc_sleep_time; if (foreground) @@ -1741,21 +1748,20 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, bool force, unsigned int segno) +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { - int gc_type = sync ? FG_GC : BG_GC; + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; int sec_freed = 0, seg_freed = 0, total_freed = 0; int ret = 0; struct cp_control cpc; - unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned int skipped_round = 0, round = 0; - trace_f2fs_gc_begin(sbi->sb, sync, background, + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1782,8 +1788,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1793,7 +1798,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) { + if (gc_type == BG_GC && gc_control->no_bg_gc) { ret = -EINVAL; goto stop; } @@ -1809,45 +1814,48 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, goto stop; } - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); - if (gc_type == FG_GC && - seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) - sec_freed++; + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks); total_freed += seg_freed; - if (gc_type == FG_GC) { - if (sbi->skipped_gc_rwsem) - skipped_round++; - round++; - } + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (sync) + if (gc_control->init_gc_type == FG_GC) goto stop; - if (!has_not_enough_free_secs(sbi, sec_freed, 0)) + if (!has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? sec_freed : 0, 0)) goto stop; - if (skipped_round <= MAX_SKIP_GC_COUNT || skipped_round * 2 < round) { - - /* Write checkpoint to reclaim prefree segments */ - if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && - prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + /* FG_GC stops GC by skip_count */ + if (gc_type == FG_GC) { + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { ret = f2fs_write_checkpoint(sbi, &cpc); - if (ret) - goto stop; + goto stop; } - segno = NULL_SEGNO; - goto gc_more; } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } + segno = NULL_SEGNO; + goto gc_more; + stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; - SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; if (gc_type == FG_GC) f2fs_unpin_all_sections(sbi, true); @@ -1865,7 +1873,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, put_gc_inode(&gc_list); - if (sync && !ret) + if (gc_control->err_gc_skipped && !ret) ret = sec_freed ? 
0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 10719b7603ac..2d7dee0a93be 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -398,8 +398,14 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false }; f2fs_down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + f2fs_gc(sbi, &gc_control); } } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ebcb4329b7d7..d3f19ff322a5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2067,8 +2067,14 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true }; + f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; break; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index ed2d4f373ac3..ad6bdfbedab6 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -653,19 +653,19 @@ TRACE_EVENT(f2fs_background_gc, TRACE_EVENT(f2fs_gc_begin, - TP_PROTO(struct super_block *sb, bool sync, bool background, + TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), - TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + TP_ARGS(sb, gc_type, no_bg_gc, dirty_nodes, dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) - __field(bool, sync) - __field(bool, background) + __field(int, gc_type) + __field(bool, no_bg_gc) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) @@ -677,8 +677,8 @@ TRACE_EVENT(f2fs_gc_begin, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->sync = sync; - __entry->background = background; + __entry->gc_type = gc_type; + __entry->no_bg_gc = no_bg_gc; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; @@ -688,12 +688,12 @@ TRACE_EVENT(f2fs_gc_begin, __entry->prefree_seg = prefree_seg; ), - TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nodes = %lld, " "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), - __entry->sync, - __entry->background, + show_gc_type(__entry->gc_type), + (__entry->gc_type == BG_GC) ? __entry->no_bg_gc : -1, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, From 380f3362d8a41edb8baf34340aec925718415a79 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 14:23:27 -0700 Subject: [PATCH 425/580] f2fs: keep wait_ms if EAGAIN happens In f2fs_gc thread, let's keep wait_ms when sec_freed was zero. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9af3190cfdf7..8715d4d15b79 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -37,7 +37,8 @@ static int gc_thread_func(void *data) unsigned int wait_ms; struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, - .should_migrate_blocks = false }; + .should_migrate_blocks = false, + .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; @@ -146,7 +147,6 @@ static int gc_thread_func(void *data) gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; gc_control.no_bg_gc = foreground; - gc_control.err_gc_skipped = sync_mode; /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, &gc_control)) From 1b2e33e5d14dafdc6f768e5f99e3ea03406ff0cd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 13:34:41 -0700 Subject: [PATCH 426/580] f2fs: do not stop GC when requiring a free section The f2fs_gc uses a bitmap to indicate pinned sections, but when disabling checkpoint, we call f2fs_gc() with NULL_SEGNO, which selects the same dirty segment as a victim all the time, resulting in checkpoint=disable failure, for example. Let's pick another one, if we fail to collect it. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 12 ++++++++---- fs/f2fs/gc.c | 14 +++++++++----- fs/f2fs/segment.c | 3 ++- fs/f2fs/super.c | 3 ++- include/trace/events/f2fs.h | 11 ++++++++--- 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ecde13deff19..fccaf66c9502 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1269,6 +1269,7 @@ struct f2fs_gc_control { bool no_bg_gc; /* check the space and stop bg_gc */ bool should_migrate_blocks; /* should migrate blocks */ bool err_gc_skipped; /* return EAGAIN if GC skipped */ + unsigned int nr_free_secs; /* # of free sections to do GC */ }; /* For s_flag in struct f2fs_sb_info */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7c0eb867e7ef..a56b864479c1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1667,7 +1667,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .init_gc_type = FG_GC, .should_migrate_blocks = false, - .err_gc_skipped = true }; + .err_gc_skipped = true, + .nr_free_secs = 0 }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -2427,7 +2428,8 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .no_bg_gc = false, - .should_migrate_blocks = false }; + .should_migrate_blocks = false, + .nr_free_secs = 0 }; __u32 sync; int ret; @@ -2468,7 +2470,8 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) .init_gc_type = range->sync ?
FG_GC : BG_GC, .no_bg_gc = false, .should_migrate_blocks = false, - .err_gc_skipped = range->sync }; + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; u64 end; int ret; @@ -2914,7 +2917,8 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) struct f2fs_gc_control gc_control = { .init_gc_type = FG_GC, .should_migrate_blocks = true, - .err_gc_skipped = true }; + .err_gc_skipped = true, + .nr_free_secs = 0 }; int ret; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8715d4d15b79..818612e421bb 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -147,6 +147,7 @@ static int gc_thread_func(void *data) gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 1 : 0; /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, &gc_control)) @@ -1762,6 +1763,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1824,12 +1826,13 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (gc_control->init_gc_type == FG_GC) - goto stop; - - if (!has_not_enough_free_secs(sbi, - (gc_type == FG_GC) ? sec_freed : 0, 0)) + if (gc_control->init_gc_type == FG_GC || + !has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? sec_freed : 0, 0)) { + if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) + goto go_gc_more; goto stop; + } /* FG_GC stops GC by skip_count */ if (gc_type == FG_GC) { @@ -1850,6 +1853,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) if (ret) goto stop; } +go_gc_more: segno = NULL_SEGNO; goto gc_more; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2d7dee0a93be..5f5d660426ca 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -403,7 +403,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) .init_gc_type = BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, - .err_gc_skipped = false }; + .err_gc_skipped = false, + .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); f2fs_gc(sbi, &gc_control); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d3f19ff322a5..b420675341fb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2071,7 +2071,8 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) .victim_segno = NULL_SEGNO, .init_gc_type = FG_GC, .should_migrate_blocks = false, - .err_gc_skipped = true }; + .err_gc_skipped = true, + .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, &gc_control); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index ad6bdfbedab6..cc4a803e7c0a 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -654,18 +654,21 @@ TRACE_EVENT(f2fs_background_gc, TRACE_EVENT(f2fs_gc_begin, TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, + unsigned int nr_free_secs, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), - TP_ARGS(sb, gc_type, no_bg_gc, dirty_nodes, dirty_dents, dirty_imeta, + TP_ARGS(sb, gc_type, no_bg_gc, nr_free_secs, dirty_nodes, + dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( 
__field(dev_t, dev) __field(int, gc_type) __field(bool, no_bg_gc) + __field(unsigned int, nr_free_secs) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) @@ -679,6 +682,7 @@ TRACE_EVENT(f2fs_gc_begin, __entry->dev = sb->s_dev; __entry->gc_type = gc_type; __entry->no_bg_gc = no_bg_gc; + __entry->nr_free_secs = nr_free_secs; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; @@ -688,12 +692,13 @@ TRACE_EVENT(f2fs_gc_begin, __entry->prefree_seg = prefree_seg; ), - TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nodes = %lld, " - "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nr_free_secs = %u, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), show_gc_type(__entry->gc_type), (__entry->gc_type == BG_GC) ? __entry->no_bg_gc : -1, + __entry->nr_free_secs, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, From 2ef745a97b093c4cf595b0cb5b8ec44552109073 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 May 2022 10:59:29 -0700 Subject: [PATCH 427/580] f2fs: don't use casefolded comparison for "." and ".." Trying to rename a directory that has all the following properties fails with EINVAL and triggers the 'WARN_ON_ONCE(!fscrypt_has_encryption_key(dir))' in f2fs_match_ci_name(): - The directory is casefolded - The directory is encrypted - The directory's encryption key is not yet set up - The parent directory is *not* encrypted The problem is incorrect handling of the lookup of ".." to get the parent reference to update. fscrypt_setup_filename() treats ".." (and ".") specially, as it's never encrypted. It's passed through as-is, and setting up the directory's key is not attempted. As the name isn't a no-key name, f2fs treats it as a "normal" name and attempts a casefolded comparison. That breaks the assumption of the WARN_ON_ONCE() in f2fs_match_ci_name() which assumes that for encrypted directories, casefolded comparisons only happen when the directory's key is set up. We could just remove this WARN_ON_ONCE(). However, since casefolding is always a no-op on "." and ".." anyway, let's instead just not casefold these names. This results in the standard bytewise comparison.
Fixes: 7ad08a58bf67 ("f2fs: Handle casefolding with Encryption") Cc: # v5.11+ Signed-off-by: Eric Biggers Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 ++- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/hash.c | 11 ++++++----- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 315c6472b3c2..4f64965fa5b6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -82,7 +82,8 @@ int f2fs_init_casefolded_name(const struct inode *dir, #ifdef CONFIG_UNICODE struct super_block *sb = dir->i_sb; - if (IS_CASEFOLDED(dir)) { + if (IS_CASEFOLDED(dir) && + !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, GFP_NOFS, false, F2FS_SB(sb)); if (!fname->cf_name.name) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fccaf66c9502..affb86f55156 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -505,11 +505,11 @@ struct f2fs_filename { #ifdef CONFIG_UNICODE /* * For casefolded directories: the casefolded name, but it's left NULL - * if the original name is not valid Unicode, if the directory is both - * casefolded and encrypted and its encryption key is unavailable, or if - * the filesystem is doing an internal operation where usr_fname is also - * NULL. In all these cases we fall back to treating the name as an - * opaque byte sequence. + * if the original name is not valid Unicode, if the original name is + * "." or "..", if the directory is both casefolded and encrypted and + * its encryption key is unavailable, or if the filesystem is doing an + * internal operation where usr_fname is also NULL. In all these cases + * we fall back to treating the name as an opaque byte sequence. */ struct fscrypt_str cf_name; #endif diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index f9b706495d1d..f1431a5c6602 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -92,7 +92,7 @@ static u32 TEA_hash_name(const u8 *p, size_t len) /* * Compute @fname->hash. For all directories, @fname->disk_name must be set. * For casefolded directories, @fname->usr_fname must be set, and also - * @fname->cf_name if the filename is valid Unicode. + * @fname->cf_name if the filename is valid Unicode and is not "." or "..". */ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) { @@ -111,10 +111,11 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) /* * If the casefolded name is provided, hash it instead of the * on-disk name. If the casefolded name is *not* provided, that - * should only be because the name wasn't valid Unicode, so fall - * back to treating the name as an opaque byte sequence. Note - * that to handle encrypted directories, the fallback must use - * usr_fname (plaintext) rather than disk_name (ciphertext). + * should only be because the name wasn't valid Unicode or was + * "." or "..", so fall back to treating the name as an opaque + * byte sequence. Note that to handle encrypted directories, + * the fallback must use usr_fname (plaintext) rather than + * disk_name (ciphertext). */ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { From 40fa21b59551eea78fea00335581c8b0da6dcbc5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 18 May 2022 20:28:41 +0800 Subject: [PATCH 428/580] f2fs: fix to do sanity check for inline inode Yanming reported a kernel bug in Bugzilla kernel [1], which can be reproduced. The bug message is: The kernel message is shown below: kernel BUG at fs/inode.c:611! 
Call Trace: evict+0x282/0x4e0 __dentry_kill+0x2b2/0x4d0 dput+0x2dd/0x720 do_renameat2+0x596/0x970 __x64_sys_rename+0x78/0x90 do_syscall_64+0x3b/0x90 [1] https://bugzilla.kernel.org/show_bug.cgi?id=215895 The bug is due to a fuzzed inode that has both inline_data and encrypted flags. During f2fs_evict_inode(), as the inode was deleted by rename(), it will cause inline data conversion due to conflicting flags. The page cache will be polluted and the panic will be triggered in clear_inode(). Try fixing the bug by doing more sanity checks for inline data inode in sanity_check_inode(). Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/inline.c | 29 ++++++++++++++++++++++++----- fs/f2fs/inode.c | 3 +-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index affb86f55156..4176c5daf9f4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4060,6 +4060,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_sanity_check_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct page *page, struct page *ipage); void f2fs_truncate_inline_inode(struct inode *inode, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4221c83c92d9..369f0da75d42 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -13,21 +13,40 @@ #include "node.h" #include -bool f2fs_may_inline_data(struct inode *inode) +static bool support_inline_data(struct inode *inode) { if (f2fs_is_atomic_file(inode)) return false; - if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; + return true; +} - if (f2fs_post_read_required(inode)) +bool f2fs_may_inline_data(struct inode *inode) +{ + if (!support_inline_data(inode)) return false; - return true; + return !f2fs_post_read_required(inode); +} + +bool f2fs_sanity_check_inline_data(struct inode *inode) +{ + if (!f2fs_has_inline_data(inode)) + return false; + + if (!support_inline_data(inode)) + return true; + + /* + * used by sanity_check_inode(), when disk layout fields has not + * been synchronized to inmem fields. + */ + return (S_ISREG(inode->i_mode) && + (file_is_encrypt(inode) || file_is_verity(inode) || + (F2FS_I(inode)->i_flags & F2FS_COMPR_FL))); } bool f2fs_may_inline_dentry(struct inode *inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 53fde010d513..c9299f09f73e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -276,8 +276,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } - if (f2fs_has_inline_data(inode) && - (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); From f74ea94cdca35ad0dba6bbcb9790c13d1699dbbd Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 19 May 2022 18:40:10 +0800 Subject: [PATCH 429/580] f2fs: make f2fs_read_inline_data() more readable In f2fs_read_inline_data(), the check of the inline_data flag can be confusing, since we already checked it before calling. So this patch adds some comments for f2fs_has_inline_data().
Signed-off-by: Chao Liu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4176c5daf9f4..3bf53daf2411 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3177,6 +3177,10 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_page(). + */ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); From dd793619a69d223b55c9d1ea585f6406361e2195 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:43 +0200 Subject: [PATCH 430/580] f2fs: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3bf53daf2411..2eccb135d0a7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1761,7 +1761,7 @@ struct f2fs_sb_info { unsigned int data_io_flag; unsigned int node_io_flag; - /* For sysfs suppport */ + /* For sysfs support */ struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; From c0ffa6a6f213ce851d72e62caa8b98c6cf7bee0b Mon Sep 17 00:00:00 2001 From: Sungjong Seo Date: Tue, 24 May 2022 10:29:11 +0900 Subject: [PATCH 431/580] f2fs: allow compression for mmap files in compress_mode=user Since commit e3c548323d32 ("f2fs: let's allow compression for mmap files"), it has been allowed to compress mmap files. However, in compress_mode=user, it is not allowed yet. To keep the same concept in both compress_modes, f2fs_ioc_(de)compress_file() should also allow it. Let's remove checking mmap files in f2fs_ioc_(de)compress_file() so that the compression for mmap files is also allowed in compress_mode=user. Signed-off-by: Sungjong Seo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a56b864479c1..2bfbb54896c9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4084,11 +4084,6 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4156,11 +4151,6 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; From fbf6e725f0f50d4c4312e4f4002e197359e8c337 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 May 2022 09:56:34 +0800 Subject: [PATCH 432/580] f2fs: avoid unneeded error handling for revoke_entry_slab allocation In __f2fs_commit_atomic_write(), we will guarantee success of revoke_entry_slab allocation, so let's avoid unneeded error handling. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5f5d660426ca..ba5bd119eb1c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -314,11 +314,6 @@ static int __f2fs_commit_atomic_write(struct inode *inode) new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, true, NULL); - if (!new) { - f2fs_put_dnode(&dn); - ret = -ENOMEM; - goto out; - } ret = __replace_atomic_write_block(inode, index, blkaddr, &new->old_addr, false); From 8d0cdfaf5b05932369da52f7dcfb54e89d9e226e Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 25 May 2022 17:43:36 +0800 Subject: [PATCH 433/580] f2fs: add f2fs_init_write_merge_io function Almost all other initialization of variables in f2fs_fill_super is extracted to a single function. Also do it for write_io[], which makes the code cleaner. This patch just refactors the code; there's no functional change. Signed-off-by: Yufen Yu Reviewed-by: Chao Yu [Jaegeuk Kim: clean up] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 28 ++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 27 +++------------------------ 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aabb35a800a0..1c1451d18661 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -565,6 +565,34 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return false; } +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + } + } + + return 0; +} + static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2eccb135d0a7..12f3b6a16f57 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3752,6 +3752,7 @@ int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b420675341fb..f7b3e3682893 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4042,30 +4042,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ?
1 : NR_TEMP_TYPE; - int j; - - sbi->write_io[i] = - f2fs_kmalloc(sbi, - array_size(n, - sizeof(struct f2fs_bio_info)), - GFP_KERNEL); - if (!sbi->write_io[i]) { - err = -ENOMEM; - goto free_bio_info; - } - - for (j = HOT; j < n; j++) { - init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); - sbi->write_io[i][j].sbi = sbi; - sbi->write_io[i][j].bio = NULL; - spin_lock_init(&sbi->write_io[i][j].io_lock); - INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); - INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); - } - } + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); From e2ef1ce5e5d196a97863eb6497ff8d7024e501ed Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Thu, 26 May 2022 10:21:06 +0800 Subject: [PATCH 434/580] f2fs: replace F2FS_I(inode) and sbi by the local variable We have defined 'fi' at the beginning of the functions, so just use it rather than calling F2FS_I(inode) again. Signed-off-by: Yufen Yu Reviewed-by: Chao Yu [Jaegeuk Kim: replace sbi] Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 22 +++++++++++----------- fs/f2fs/inode.c | 12 ++++++------ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 12f3b6a16f57..f08358c3f26c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4457,8 +4457,8 @@ static inline bool f2fs_may_compress(struct inode *inode) static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { - int diff = F2FS_I(inode)->i_cluster_size - blocks; struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; /* don't update i_compr_blocks if saved blocks were released */ if (!add && !atomic_read(&fi->i_compr_blocks)) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2bfbb54896c9..0568e07a6023 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2102,25 +2102,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; - f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by * f2fs_is_atomic_file.
*/ if (get_dirty_pages(inode)) - f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } /* Create a COW inode for atomic write */ pinode = f2fs_iget(inode->i_sb, fi->i_pino); if (IS_ERR(pinode)) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); ret = PTR_ERR(pinode); goto out; } @@ -2128,7 +2128,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) ret = f2fs_get_tmpfile(pinode, &fi->cow_inode); iput(pinode); if (ret) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); @@ -2140,10 +2140,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->atomic_write_task = current; + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -3026,7 +3026,7 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) kprojid = make_kprojid(&init_user_ns, (projid_t)projid); - if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + if (projid_eq(kprojid, fi->i_projid)) return 0; err = -EPERM; @@ -3046,7 +3046,7 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) if (err) goto out_unlock; - F2FS_I(inode)->i_projid = kprojid; + fi->i_projid = kprojid; inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: @@ -4061,7 +4061,7 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); pgoff_t page_idx = 0, last_idx; unsigned int blk_per_seg = sbi->blocks_per_seg; - int cluster_size = F2FS_I(inode)->i_cluster_size; + int cluster_size = fi->i_cluster_size; int count, ret; if (!f2fs_sb_has_compression(sbi) || diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c9299f09f73e..52ac0122f62a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -260,8 +260,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (F2FS_I(inode)->extent_tree) { - struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + if (fi->extent_tree) { + struct extent_info *ei = &fi->extent_tree->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -465,10 +465,10 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); From bb2449b3b611e83ea70e79d126a3054d903002b6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 27 May 2022 12:13:30 +0800 Subject: [PATCH 435/580] f2fs: fix to tag gcing flag on page during 
file defragment In order to garantee migrated data be persisted during checkpoint, otherwise out-of-order persistency between data and node may cause data corruption after SPOR. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0568e07a6023..a5aa5340445a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2675,6 +2675,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, } set_page_dirty(page); + set_page_private_gcing(page); f2fs_put_page(page, 1); idx++; From 02775956e26a72e7a2e37612a8d3867aad802d0e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 31 May 2022 18:27:09 -0700 Subject: [PATCH 436/580] f2fs: attach inline_data after setting compression This fixes the below corruption. [345393.335389] F2FS-fs (vdb): sanity_check_inode: inode (ino=6d0, mode=33206) should not have inline_data, run fsck to fix Fixes: 677a82b44ebf ("f2fs: fix to do sanity check for inline inode") Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 654c85443d91..d35a58d47f12 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -83,8 +83,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); - if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); @@ -101,10 +99,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) f2fs_init_extent_tree(inode, NULL); - stat_inc_inline_xattr(inode); - stat_inc_inline_inode(inode); - stat_inc_inline_dir(inode); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -121,6 +115,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_compress_context(inode); } + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + f2fs_set_inode_flags(inode); trace_f2fs_new_inode(inode, 0); @@ -319,6 +321,8 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, if (!is_extension_exist(name, ext[i], false)) continue; + /* Do not use inline_data with compression */ + clear_inode_flag(inode, FI_INLINE_DATA); set_compress_context(inode); return; } From d1dcdb9fb086a4923614268d7237f7bf8b41dd29 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 31 May 2022 18:27:09 -0700 Subject: [PATCH 437/580] f2fs: attach inline_data after setting compression This fixes the below corruption. 
[345393.335389] F2FS-fs (vdb): sanity_check_inode: inode (ino=6d0, mode=33206) should not have inline_data, run fsck to fix Cc: Fixes: 677a82b44ebf ("f2fs: fix to do sanity check for inline inode") Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d35a58d47f12..b2ebd64f8108 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -322,6 +322,7 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, continue; /* Do not use inline_data with compression */ + stat_dec_inline_inode(inode); clear_inode_flag(inode, FI_INLINE_DATA); set_compress_context(inode); return; From c9a5d0a14799f1c3adb7618f9a616e0dddb371c1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 8 Jun 2022 09:19:56 -0700 Subject: [PATCH 438/580] f2fs: run GCs synchronously given user requests When users set GC_URGENT or GC_MID, they expected to do GCs right away. But, there's a condition to bypass it. Let's indicate we need to do now in the thread. Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 818612e421bb..f55f41802dc9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -37,7 +37,6 @@ static int gc_thread_func(void *data) unsigned int wait_ms; struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, - .should_migrate_blocks = false, .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; @@ -113,7 +112,10 @@ static int gc_thread_func(void *data) sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; f2fs_down_write(&sbi->gc_lock); + gc_control.should_migrate_blocks = true; goto do_gc; + } else { + gc_control.should_migrate_blocks = false; } if (foreground) { @@ -139,7 +141,9 @@ static int gc_thread_func(void *data) if (!foreground) stat_inc_bggc_count(sbi->stat_info); - sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; + sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC || + sbi->gc_mode == GC_URGENT_HIGH || + sbi->gc_mode == GC_URGENT_MID; /* foreground GC was been triggered via f2fs_balance_fs() */ if (foreground) From c0d87eeff1954516d9ea04eb30d3a56b6e680b1f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 18 Jun 2022 00:42:24 -0700 Subject: [PATCH 439/580] f2fs: do not count ENOENT for error case Otherwise, we can get a wrong cp_error mark. Cc: Fixes: a7b8618aa2f0 ("f2fs: avoid infinite loop to flush node pages") Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b06ccf6fdd5c..c4fbebc0a487 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1450,7 +1450,9 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, out_err: ClearPageUptodate(page); out_put_err: - f2fs_handle_page_eio(sbi, page->index, NODE); + /* ENOENT comes from read_node_page which is not an error. */ + if (err != -ENOENT) + f2fs_handle_page_eio(sbi, page->index, NODE); f2fs_put_page(page, 1); return ERR_PTR(err); } From bd30e7c9fe200bf35a6e23bd2a6ae3792f828475 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 11 Jun 2022 10:55:43 -0700 Subject: [PATCH 440/580] f2fs: do not skip updating inode when retrying to flush node page Let's try to flush dirty inode again to improve subtle i_blocks mismatch. 
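For illustration only (not part of the patch), a minimal before/after sketch of the per-page path in f2fs_sync_node_pages(); the surrounding pagevec walk, locking, inline-data flush and error handling are omitted:

        /* before: a one-shot latch meant the dirty inode was flushed
         * at most once per f2fs_sync_node_pages() call */
        bool may_dirty = true;

        if (IS_INODE(page) && may_dirty) {
                may_dirty = false;
                if (flush_dirty_inode(page))
                        goto lock_node;
        }

        /* after: retry the flush every time the inode page is revisited,
         * narrowing the window for a stale i_blocks on disk */
        if (IS_INODE(page) && flush_dirty_inode(page))
                goto lock_node;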
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c4fbebc0a487..9cb66f35d191 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1945,7 +1945,6 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; bool submitted = false; - bool may_dirty = true; /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[NODE]) && @@ -1998,11 +1997,8 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, } /* flush dirty inode */ - if (IS_INODE(page) && may_dirty) { - may_dirty = false; - if (flush_dirty_inode(page)) - goto lock_node; - } + if (IS_INODE(page) && flush_dirty_inode(page)) + goto lock_node; write_node: f2fs_wait_on_page_writeback(page, NODE, true, true); From ce290a15c9783728d3c98ba0b3a501d27ed973d4 Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Tue, 31 May 2022 09:16:56 +0800 Subject: [PATCH 441/580] f2fs: optimize error handling in redirty_blocks Current error handling is at risk of page leaks. However, we dot't seek any failure scenarios, just use f2fs_bug_on. Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a5aa5340445a..760de3e4fb50 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4043,10 +4043,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) for (i = 0; i < page_len; i++, redirty_idx++) { page = find_lock_page(mapping, redirty_idx); - if (!page) { - ret = -ENOMEM; - break; - } + + /* It will never fail, when page has pinned above */ + f2fs_bug_on(F2FS_I_SB(inode), !page); + set_page_dirty(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); From 31f7a8c6e6ca9507c80faf9624e900d6ce924f54 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 19 Jun 2022 01:51:55 +0800 Subject: [PATCH 442/580] f2fs: initialize page_array_entry slab only if compression feature is on Otherwise, in image which doesn't support compression feature, page_array_entry will be initialized w/o use. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index e0409daf1ff0..eac3d13f9580 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1878,6 +1878,9 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; char slab_name[32]; + if (!f2fs_sb_has_compression(sbi)) + return 0; + sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); sbi->page_array_slab_size = sizeof(struct page *) << From db042a491db4e563c727f307a76059d2d3ad72f4 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 20 Jun 2022 10:38:42 -0700 Subject: [PATCH 443/580] f2fs: introduce memory mode Introduce memory mode to supports "normal" and "low" memory modes. "low" mode is to support low memory devices. Because of the nature of low memory devices, in this mode, f2fs will try to save memory sometimes by sacrificing performance. "normal" mode is the default mode and same as before. 
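As a usage note, the mode is selected at mount time with -o memory=low (the default is memory=normal). The sketch below is a hypothetical consumer, not code from this patch; it only shows the intended pattern for honouring the mode through the f2fs_low_mem_mode() helper introduced here, and the function name and numbers are illustrative assumptions:

        /* hypothetical example: pick a smaller cache footprint when the
         * filesystem was mounted with memory=low */
        static unsigned int f2fs_example_cache_pages(struct f2fs_sb_info *sbi)
        {
                if (f2fs_low_mem_mode(sbi))
                        return 16;      /* illustrative value, save memory */
                return 256;             /* illustrative value, favour speed */
        }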
Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 5 +++++ fs/f2fs/f2fs.h | 13 +++++++++++++ fs/f2fs/super.c | 24 ++++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index d294d04c06d0..3a5e309823ba 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -336,6 +336,11 @@ discard_unit=%s Control discard unit, the argument can be "block", "segment" default, it is helpful for large sized SMR or ZNS devices to reduce memory cost by getting rid of fs metadata supports small discard. +memory=%s Control memory mode. This supports "normal" and "low" modes. + "low" mode is introduced to support low memory devices. + Because of the nature of low memory devices, in this mode, f2fs + will try to save memory sometimes by sacrificing performance. + "normal" mode is the default mode and same as before. ======================== ============================================================ Debugfs Entries diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f08358c3f26c..43ab2a0831a3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -155,6 +155,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ + int memory_mode; /* memory mode */ int discard_unit; /* * discard command's offset/size should * be aligned to this unit: block, @@ -1357,6 +1358,13 @@ enum { DISCARD_UNIT_SECTION, /* basic discard unit is section */ }; +enum { + MEMORY_MODE_NORMAL, /* memory mode for normal devices */ + MEMORY_MODE_LOW, /* memory mode for low memry devices */ +}; + + + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -4446,6 +4454,11 @@ static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode) return false; } +static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW; +} + static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f7b3e3682893..0fae4b285971 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -157,6 +157,7 @@ enum { Opt_gc_merge, Opt_nogc_merge, Opt_discard_unit, + Opt_memory_mode, Opt_err, }; @@ -232,6 +233,7 @@ static match_table_t f2fs_tokens = { {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, + {Opt_memory_mode, "memory=%s"}, {Opt_err, NULL}, }; @@ -1211,6 +1213,22 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_memory_mode: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "normal")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_NORMAL; + } else if (!strcmp(name, "low")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_LOW; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1986,6 +2004,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) seq_printf(seq, ",discard_unit=%s", "section"); + if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL) + seq_printf(seq, 
",memory=%s", "normal"); + else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) + seq_printf(seq, ",memory=%s", "low"); + return 0; } @@ -2007,6 +2030,7 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).compress_ext_cnt = 0; F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; + F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); From edbfe7380bb16f64d192b7ff1c53e79bdb657080 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 20 Jun 2022 10:38:43 -0700 Subject: [PATCH 444/580] f2fs: handle decompress only post processing in softirq Now decompression is being handled in workqueue and it makes read I/O latency non-deterministic, because of the non-deterministic scheduling nature of workqueues. So, I made it handled in softirq context only if possible, not in low memory devices, since this modification will maintain decompresion related memory a little longer. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 222 +++++++++++++++++++++++++++------------------ fs/f2fs/data.c | 52 +++++++---- fs/f2fs/f2fs.h | 17 ++-- 3 files changed, 179 insertions(+), 112 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index eac3d13f9580..05ccd99f16f8 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -699,14 +699,107 @@ static int f2fs_compress_pages(struct compress_ctx *cc) return ret; } -void f2fs_decompress_cluster(struct decompress_io_ctx *dic) +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool end_io) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + int i; + + if (end_io ^ f2fs_low_mem_mode(F2FS_I_SB(dic->inode))) + return 0; + + dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); + if (!dic->tpages) + return 1; + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) { + dic->tpages[i] = dic->rpages[i]; + continue; + } + + dic->tpages[i] = f2fs_compress_alloc_page(); + if (!dic->tpages[i]) + return 1; + } + + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); + if (!dic->rbuf) + return 1; + + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); + if (!dic->cbuf) + return 1; + + cops = f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + if (cops->init_decompress_ctx) { + int ret = cops->init_decompress_ctx(dic); + + if (ret) + return 1; + } + + return 0; +} + +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool end_io) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + + if (end_io ^ f2fs_low_mem_mode(F2FS_I_SB(dic->inode))) + return; + + if (!bypass_destroy_callback && cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); + + if (dic->cbuf) + vm_unmap_ram(dic->cbuf, dic->nr_cpages); + + if (dic->rbuf) + vm_unmap_ram(dic->rbuf, dic->cluster_size); +} + +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback) +{ + int i; + + f2fs_release_decomp_mem(dic, bypass_destroy_callback, false); + + if (dic->tpages) { + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) + continue; + if (!dic->tpages[i]) + continue; + f2fs_compress_free_page(dic->tpages[i]); + } + page_array_free(dic->inode, dic->tpages, dic->cluster_size); + } + + if (dic->cpages) { + for (i = 0; i < dic->nr_cpages; i++) { + if (!dic->cpages[i]) + continue; + f2fs_compress_free_page(dic->cpages[i]); + } + 
page_array_free(dic->inode, dic->cpages, dic->nr_cpages); + } + + page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + kmem_cache_free(dic_entry_slab, dic); +} + +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; + bool bypass_callback = false; int ret; - int i; trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); @@ -716,41 +809,10 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) goto out_end_io; } - dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); - if (!dic->tpages) { + if (f2fs_prepare_decomp_mem(dic, true)) { + bypass_callback = true; ret = -ENOMEM; - goto out_end_io; - } - - for (i = 0; i < dic->cluster_size; i++) { - if (dic->rpages[i]) { - dic->tpages[i] = dic->rpages[i]; - continue; - } - - dic->tpages[i] = f2fs_compress_alloc_page(); - if (!dic->tpages[i]) { - ret = -ENOMEM; - goto out_end_io; - } - } - - if (cops->init_decompress_ctx) { - ret = cops->init_decompress_ctx(dic); - if (ret) - goto out_end_io; - } - - dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); - if (!dic->rbuf) { - ret = -ENOMEM; - goto out_destroy_decompress_ctx; - } - - dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); - if (!dic->cbuf) { - ret = -ENOMEM; - goto out_vunmap_rbuf; + goto out_release; } dic->clen = le32_to_cpu(dic->cbuf->clen); @@ -758,7 +820,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; - goto out_vunmap_cbuf; + goto out_release; } ret = cops->decompress_pages(dic); @@ -779,17 +841,13 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) } } -out_vunmap_cbuf: - vm_unmap_ram(dic->cbuf, dic->nr_cpages); -out_vunmap_rbuf: - vm_unmap_ram(dic->rbuf, dic->cluster_size); -out_destroy_decompress_ctx: - if (cops->destroy_decompress_ctx) - cops->destroy_decompress_ctx(dic); +out_release: + f2fs_release_decomp_mem(dic, bypass_callback, true); + out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - f2fs_decompress_end_io(dic, ret); + f2fs_decompress_end_io(dic, ret, in_task); } /* @@ -799,7 +857,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) * (or in the case of a failure, cleans up without actually decompressing). 
*/ void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr) + block_t blkaddr, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); @@ -809,12 +867,12 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, if (failed) WRITE_ONCE(dic->failed, true); - else if (blkaddr) + else if (blkaddr && in_task) f2fs_cache_compressed_page(sbi, page, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic); + f2fs_decompress_cluster(dic, in_task); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1522,16 +1580,14 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, return err; } -static void f2fs_free_dic(struct decompress_io_ctx *dic); - struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); int i; - dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, - false, F2FS_I_SB(cc->inode)); + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!dic) return ERR_PTR(-ENOMEM); @@ -1572,52 +1628,43 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->cpages[i] = page; } + if (f2fs_prepare_decomp_mem(dic, false)) + goto out_free; + return dic; out_free: - f2fs_free_dic(dic); + f2fs_free_dic(dic, true); return ERR_PTR(-ENOMEM); } -static void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_late_free_dic(struct work_struct *work) { - int i; + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, free_work); - if (dic->tpages) { - for (i = 0; i < dic->cluster_size; i++) { - if (dic->rpages[i]) - continue; - if (!dic->tpages[i]) - continue; - f2fs_compress_free_page(dic->tpages[i]); - } - page_array_free(dic->inode, dic->tpages, dic->cluster_size); - } - - if (dic->cpages) { - for (i = 0; i < dic->nr_cpages; i++) { - if (!dic->cpages[i]) - continue; - f2fs_compress_free_page(dic->cpages[i]); - } - page_array_free(dic->inode, dic->cpages, dic->nr_cpages); - } - - page_array_free(dic->inode, dic->rpages, dic->nr_rpages); - kmem_cache_free(dic_entry_slab, dic); + f2fs_free_dic(dic, false); } -static void f2fs_put_dic(struct decompress_io_ctx *dic) +static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) { - if (refcount_dec_and_test(&dic->refcnt)) - f2fs_free_dic(dic); + if (refcount_dec_and_test(&dic->refcnt)) { + if (in_task) { + f2fs_free_dic(dic, false); + } else { + INIT_WORK(&dic->free_work, f2fs_late_free_dic); + queue_work(F2FS_I_SB(dic->inode)->post_read_wq, + &dic->free_work); + } + } } /* * Update and unlock the cluster's pagecache pages, and release the reference to * the decompress_io_ctx that was being held for I/O completion. 
*/ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) { int i; @@ -1638,7 +1685,7 @@ static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) unlock_page(rpage); } - f2fs_put_dic(dic); + f2fs_put_dic(dic, in_task); } static void f2fs_verify_cluster(struct work_struct *work) @@ -1655,14 +1702,15 @@ static void f2fs_verify_cluster(struct work_struct *work) SetPageError(rpage); } - __f2fs_decompress_end_io(dic, false); + __f2fs_decompress_end_io(dic, false, true); } /* * This is called when a compressed cluster has been decompressed * (or failed to be read and/or decompressed). */ -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) { if (!failed && dic->need_verity) { /* @@ -1674,7 +1722,7 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); } else { - __f2fs_decompress_end_io(dic, failed); + __f2fs_decompress_end_io(dic, failed, in_task); } } @@ -1683,12 +1731,12 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) * * This is called when the page is no longer needed and can be freed. */ -void f2fs_put_page_dic(struct page *page) +void f2fs_put_page_dic(struct page *page, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); - f2fs_put_dic(dic); + f2fs_put_dic(dic, in_task); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1c1451d18661..99e1afcbff51 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -117,7 +117,7 @@ struct bio_post_read_ctx { block_t fs_blkaddr; }; -static void f2fs_finish_read_bio(struct bio *bio) +static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; int iter_all; @@ -131,8 +131,9 @@ static void f2fs_finish_read_bio(struct bio *bio) if (f2fs_is_compressed_page(page)) { if (bio->bi_status) - f2fs_end_read_compressed_page(page, true, 0); - f2fs_put_page_dic(page); + f2fs_end_read_compressed_page(page, true, 0, + in_task); + f2fs_put_page_dic(page, in_task); continue; } @@ -189,7 +190,7 @@ static void f2fs_verify_bio(struct work_struct *work) fsverity_verify_bio(bio); } - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, true); } /* @@ -201,7 +202,7 @@ static void f2fs_verify_bio(struct work_struct *work) * can involve reading verity metadata pages from the file, and these verity * metadata pages may be encrypted and/or compressed. */ -static void f2fs_verify_and_finish_bio(struct bio *bio) +static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) { struct bio_post_read_ctx *ctx = bio->bi_private; @@ -209,7 +210,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio) INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); } else { - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, in_task); } } @@ -222,7 +223,8 @@ static void f2fs_verify_and_finish_bio(struct bio *bio) * that the bio includes at least one compressed page. The actual decompression * is done on a per-cluster basis, not a per-bio basis. 
*/ -static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, + bool in_task) { struct bio_vec *bv; int iter_all; @@ -235,7 +237,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) /* PG_error was set if decryption failed. */ if (f2fs_is_compressed_page(page)) f2fs_end_read_compressed_page(page, PageError(page), - blkaddr); + blkaddr, in_task); else all_compressed = false; @@ -260,15 +262,16 @@ static void f2fs_post_read_work(struct work_struct *work) fscrypt_decrypt_bio(ctx->bio); if (ctx->enabled_steps & STEP_DECOMPRESS) - f2fs_handle_step_decompress(ctx); + f2fs_handle_step_decompress(ctx, true); - f2fs_verify_and_finish_bio(ctx->bio); + f2fs_verify_and_finish_bio(ctx->bio, true); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); struct bio_post_read_ctx *ctx; + bool intask = in_task(); iostat_update_and_unbind_ctx(bio, 0); ctx = bio->bi_private; @@ -279,16 +282,29 @@ static void f2fs_read_end_io(struct bio *bio) } if (bio->bi_status) { - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, intask); return; } - if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - queue_work(ctx->sbi->post_read_wq, &ctx->work); - } else { - f2fs_verify_and_finish_bio(bio); + if (ctx) { + unsigned int enabled_steps = ctx->enabled_steps & + (STEP_DECRYPT | STEP_DECOMPRESS); + + /* + * If we have only decompression step between decompression and + * decrypt, we don't need post processing for this. + */ + if (enabled_steps == STEP_DECOMPRESS && + !f2fs_low_mem_mode(sbi)) { + f2fs_handle_step_decompress(ctx, intask); + } else if (enabled_steps) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + return; + } } + + f2fs_verify_and_finish_bio(bio, intask); } static void f2fs_write_end_io(struct bio *bio) @@ -2247,7 +2263,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_load_compressed_page(sbi, page, blkaddr)) { if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic); + f2fs_decompress_cluster(dic, true); continue; } @@ -2264,7 +2280,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - f2fs_decompress_end_io(dic, ret); + f2fs_decompress_end_io(dic, ret, true); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 43ab2a0831a3..50ecc7599d39 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1591,6 +1591,7 @@ struct decompress_io_ctx { void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ + struct work_struct free_work; /* work for late free this structure itself */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -4197,9 +4198,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_cluster(struct decompress_io_ctx *dic); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr); + block_t blkaddr, bool in_task); bool 
f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, @@ -4218,8 +4219,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); -void f2fs_put_page_dic(struct page *page); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task); +void f2fs_put_page_dic(struct page *page, bool in_task); unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); @@ -4265,13 +4267,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } -static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { } +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, + bool in_task) { } static inline void f2fs_end_read_compressed_page(struct page *page, - bool failed, block_t blkaddr) + bool failed, block_t blkaddr, bool in_task) { WARN_ON_ONCE(1); } -static inline void f2fs_put_page_dic(struct page *page) +static inline void f2fs_put_page_dic(struct page *page, bool in_task) { WARN_ON_ONCE(1); } From 0e5ad561100b638d8ce2ce54a6623fe637cc8bbb Mon Sep 17 00:00:00 2001 From: duguowei Date: Mon, 20 Jun 2022 21:39:45 +0800 Subject: [PATCH 445/580] f2fs: remove redundant code for gc condition Remove the redundant code and use local variant as the argument directly. Make it more human-readable. Signed-off-by: duguowei [Jaegeuk Kim: make code neat] Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 3fe145e8e594..19b956c2d697 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -120,15 +120,13 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) return free_blks - ovp_blks; } -static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_invalid_user_blocks(block_t user_block_count) { - return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100; + return (long)(user_block_count * LIMIT_INVALID_BLOCK) / 100; } -static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_free_user_blocks(block_t reclaimable_user_blocks) { - block_t reclaimable_user_blocks = sbi->user_block_count - - written_block_count(sbi); return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } @@ -163,15 +161,16 @@ static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) { - block_t invalid_user_blocks = sbi->user_block_count - - written_block_count(sbi); + block_t user_block_count = sbi->user_block_count; + block_t invalid_user_blocks = user_block_count - + written_block_count(sbi); /* * Background GC is triggered with the following conditions. * 1. There are a number of invalid blocks. * 2. There is not enough free space. 
*/ - if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && - free_user_blocks(sbi) < limit_free_user_blocks(sbi)) - return true; - return false; + return (invalid_user_blocks > + limit_invalid_user_blocks(user_block_count) && + free_user_blocks(sbi) < + limit_free_user_blocks(invalid_user_blocks)); } From c4f620ec7f4dfa5087fe170b7daae23b84a5cf81 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 8 Aug 2022 17:17:38 -0700 Subject: [PATCH 446/580] Revert "f2fs: run GCs synchronously given user requests" This reverts commit c9a5d0a14799f1c3adb7618f9a616e0dddb371c1. --- fs/f2fs/gc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f55f41802dc9..818612e421bb 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -37,6 +37,7 @@ static int gc_thread_func(void *data) unsigned int wait_ms; struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false, .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; @@ -112,10 +113,7 @@ static int gc_thread_func(void *data) sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; f2fs_down_write(&sbi->gc_lock); - gc_control.should_migrate_blocks = true; goto do_gc; - } else { - gc_control.should_migrate_blocks = false; } if (foreground) { @@ -141,9 +139,7 @@ static int gc_thread_func(void *data) if (!foreground) stat_inc_bggc_count(sbi->stat_info); - sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC || - sbi->gc_mode == GC_URGENT_HIGH || - sbi->gc_mode == GC_URGENT_MID; + sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; /* foreground GC was been triggered via f2fs_balance_fs() */ if (foreground) From a09cf47c4e4ad2dc39b8ffaabf3136631ea45a21 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 8 Aug 2022 17:16:28 -0700 Subject: [PATCH 447/580] Revert "f2fs: handle decompress only post processing in softirq" This reverts commit edbfe7380bb16f64d192b7ff1c53e79bdb657080. 
--- fs/f2fs/compress.c | 222 ++++++++++++++++++--------------------------- fs/f2fs/data.c | 52 ++++------- fs/f2fs/f2fs.h | 17 ++-- 3 files changed, 112 insertions(+), 179 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 05ccd99f16f8..eac3d13f9580 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -699,107 +699,14 @@ static int f2fs_compress_pages(struct compress_ctx *cc) return ret; } -static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool end_io) -{ - const struct f2fs_compress_ops *cops = - f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; - int i; - - if (end_io ^ f2fs_low_mem_mode(F2FS_I_SB(dic->inode))) - return 0; - - dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); - if (!dic->tpages) - return 1; - - for (i = 0; i < dic->cluster_size; i++) { - if (dic->rpages[i]) { - dic->tpages[i] = dic->rpages[i]; - continue; - } - - dic->tpages[i] = f2fs_compress_alloc_page(); - if (!dic->tpages[i]) - return 1; - } - - dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); - if (!dic->rbuf) - return 1; - - dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); - if (!dic->cbuf) - return 1; - - cops = f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; - if (cops->init_decompress_ctx) { - int ret = cops->init_decompress_ctx(dic); - - if (ret) - return 1; - } - - return 0; -} - -static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, - bool bypass_destroy_callback, bool end_io) -{ - const struct f2fs_compress_ops *cops = - f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; - - if (end_io ^ f2fs_low_mem_mode(F2FS_I_SB(dic->inode))) - return; - - if (!bypass_destroy_callback && cops->destroy_decompress_ctx) - cops->destroy_decompress_ctx(dic); - - if (dic->cbuf) - vm_unmap_ram(dic->cbuf, dic->nr_cpages); - - if (dic->rbuf) - vm_unmap_ram(dic->rbuf, dic->cluster_size); -} - -static void f2fs_free_dic(struct decompress_io_ctx *dic, - bool bypass_destroy_callback) -{ - int i; - - f2fs_release_decomp_mem(dic, bypass_destroy_callback, false); - - if (dic->tpages) { - for (i = 0; i < dic->cluster_size; i++) { - if (dic->rpages[i]) - continue; - if (!dic->tpages[i]) - continue; - f2fs_compress_free_page(dic->tpages[i]); - } - page_array_free(dic->inode, dic->tpages, dic->cluster_size); - } - - if (dic->cpages) { - for (i = 0; i < dic->nr_cpages; i++) { - if (!dic->cpages[i]) - continue; - f2fs_compress_free_page(dic->cpages[i]); - } - page_array_free(dic->inode, dic->cpages, dic->nr_cpages); - } - - page_array_free(dic->inode, dic->rpages, dic->nr_rpages); - kmem_cache_free(dic_entry_slab, dic); -} - -void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) +void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; - bool bypass_callback = false; int ret; + int i; trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); @@ -809,10 +716,41 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) goto out_end_io; } - if (f2fs_prepare_decomp_mem(dic, true)) { - bypass_callback = true; + dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); + if (!dic->tpages) { ret = -ENOMEM; - goto out_release; + goto out_end_io; + } + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) { + dic->tpages[i] = dic->rpages[i]; + continue; + } + + dic->tpages[i] = 
f2fs_compress_alloc_page(); + if (!dic->tpages[i]) { + ret = -ENOMEM; + goto out_end_io; + } + } + + if (cops->init_decompress_ctx) { + ret = cops->init_decompress_ctx(dic); + if (ret) + goto out_end_io; + } + + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); + if (!dic->rbuf) { + ret = -ENOMEM; + goto out_destroy_decompress_ctx; + } + + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); + if (!dic->cbuf) { + ret = -ENOMEM; + goto out_vunmap_rbuf; } dic->clen = le32_to_cpu(dic->cbuf->clen); @@ -820,7 +758,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; - goto out_release; + goto out_vunmap_cbuf; } ret = cops->decompress_pages(dic); @@ -841,13 +779,17 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) } } -out_release: - f2fs_release_decomp_mem(dic, bypass_callback, true); - +out_vunmap_cbuf: + vm_unmap_ram(dic->cbuf, dic->nr_cpages); +out_vunmap_rbuf: + vm_unmap_ram(dic->rbuf, dic->cluster_size); +out_destroy_decompress_ctx: + if (cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - f2fs_decompress_end_io(dic, ret, in_task); + f2fs_decompress_end_io(dic, ret); } /* @@ -857,7 +799,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) * (or in the case of a failure, cleans up without actually decompressing). */ void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr, bool in_task) + block_t blkaddr) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); @@ -867,12 +809,12 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, if (failed) WRITE_ONCE(dic->failed, true); - else if (blkaddr && in_task) + else if (blkaddr) f2fs_cache_compressed_page(sbi, page, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic, in_task); + f2fs_decompress_cluster(dic); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1580,14 +1522,16 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, return err; } +static void f2fs_free_dic(struct decompress_io_ctx *dic); + struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); int i; - dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, + false, F2FS_I_SB(cc->inode)); if (!dic) return ERR_PTR(-ENOMEM); @@ -1628,43 +1572,52 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->cpages[i] = page; } - if (f2fs_prepare_decomp_mem(dic, false)) - goto out_free; - return dic; out_free: - f2fs_free_dic(dic, true); + f2fs_free_dic(dic); return ERR_PTR(-ENOMEM); } -static void f2fs_late_free_dic(struct work_struct *work) +static void f2fs_free_dic(struct decompress_io_ctx *dic) { - struct decompress_io_ctx *dic = - container_of(work, struct decompress_io_ctx, free_work); + int i; - f2fs_free_dic(dic, false); + if (dic->tpages) { + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) + continue; + if (!dic->tpages[i]) + continue; + f2fs_compress_free_page(dic->tpages[i]); + } + page_array_free(dic->inode, dic->tpages, dic->cluster_size); + } + + if (dic->cpages) { + for (i = 0; i < dic->nr_cpages; i++) { 
+ if (!dic->cpages[i]) + continue; + f2fs_compress_free_page(dic->cpages[i]); + } + page_array_free(dic->inode, dic->cpages, dic->nr_cpages); + } + + page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + kmem_cache_free(dic_entry_slab, dic); } -static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) +static void f2fs_put_dic(struct decompress_io_ctx *dic) { - if (refcount_dec_and_test(&dic->refcnt)) { - if (in_task) { - f2fs_free_dic(dic, false); - } else { - INIT_WORK(&dic->free_work, f2fs_late_free_dic); - queue_work(F2FS_I_SB(dic->inode)->post_read_wq, - &dic->free_work); - } - } + if (refcount_dec_and_test(&dic->refcnt)) + f2fs_free_dic(dic); } /* * Update and unlock the cluster's pagecache pages, and release the reference to * the decompress_io_ctx that was being held for I/O completion. */ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, - bool in_task) +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) { int i; @@ -1685,7 +1638,7 @@ static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, unlock_page(rpage); } - f2fs_put_dic(dic, in_task); + f2fs_put_dic(dic); } static void f2fs_verify_cluster(struct work_struct *work) @@ -1702,15 +1655,14 @@ static void f2fs_verify_cluster(struct work_struct *work) SetPageError(rpage); } - __f2fs_decompress_end_io(dic, false, true); + __f2fs_decompress_end_io(dic, false); } /* * This is called when a compressed cluster has been decompressed * (or failed to be read and/or decompressed). */ -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, - bool in_task) +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) { if (!failed && dic->need_verity) { /* @@ -1722,7 +1674,7 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); } else { - __f2fs_decompress_end_io(dic, failed, in_task); + __f2fs_decompress_end_io(dic, failed); } } @@ -1731,12 +1683,12 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, * * This is called when the page is no longer needed and can be freed. */ -void f2fs_put_page_dic(struct page *page, bool in_task) +void f2fs_put_page_dic(struct page *page) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); - f2fs_put_dic(dic, in_task); + f2fs_put_dic(dic); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 99e1afcbff51..1c1451d18661 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -117,7 +117,7 @@ struct bio_post_read_ctx { block_t fs_blkaddr; }; -static void f2fs_finish_read_bio(struct bio *bio, bool in_task) +static void f2fs_finish_read_bio(struct bio *bio) { struct bio_vec *bv; int iter_all; @@ -131,9 +131,8 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) if (f2fs_is_compressed_page(page)) { if (bio->bi_status) - f2fs_end_read_compressed_page(page, true, 0, - in_task); - f2fs_put_page_dic(page, in_task); + f2fs_end_read_compressed_page(page, true, 0); + f2fs_put_page_dic(page); continue; } @@ -190,7 +189,7 @@ static void f2fs_verify_bio(struct work_struct *work) fsverity_verify_bio(bio); } - f2fs_finish_read_bio(bio, true); + f2fs_finish_read_bio(bio); } /* @@ -202,7 +201,7 @@ static void f2fs_verify_bio(struct work_struct *work) * can involve reading verity metadata pages from the file, and these verity * metadata pages may be encrypted and/or compressed. 
*/ -static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) +static void f2fs_verify_and_finish_bio(struct bio *bio) { struct bio_post_read_ctx *ctx = bio->bi_private; @@ -210,7 +209,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); } else { - f2fs_finish_read_bio(bio, in_task); + f2fs_finish_read_bio(bio); } } @@ -223,8 +222,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) * that the bio includes at least one compressed page. The actual decompression * is done on a per-cluster basis, not a per-bio basis. */ -static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, - bool in_task) +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) { struct bio_vec *bv; int iter_all; @@ -237,7 +235,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, /* PG_error was set if decryption failed. */ if (f2fs_is_compressed_page(page)) f2fs_end_read_compressed_page(page, PageError(page), - blkaddr, in_task); + blkaddr); else all_compressed = false; @@ -262,16 +260,15 @@ static void f2fs_post_read_work(struct work_struct *work) fscrypt_decrypt_bio(ctx->bio); if (ctx->enabled_steps & STEP_DECOMPRESS) - f2fs_handle_step_decompress(ctx, true); + f2fs_handle_step_decompress(ctx); - f2fs_verify_and_finish_bio(ctx->bio, true); + f2fs_verify_and_finish_bio(ctx->bio); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); struct bio_post_read_ctx *ctx; - bool intask = in_task(); iostat_update_and_unbind_ctx(bio, 0); ctx = bio->bi_private; @@ -282,29 +279,16 @@ static void f2fs_read_end_io(struct bio *bio) } if (bio->bi_status) { - f2fs_finish_read_bio(bio, intask); + f2fs_finish_read_bio(bio); return; } - if (ctx) { - unsigned int enabled_steps = ctx->enabled_steps & - (STEP_DECRYPT | STEP_DECOMPRESS); - - /* - * If we have only decompression step between decompression and - * decrypt, we don't need post processing for this. 
- */ - if (enabled_steps == STEP_DECOMPRESS && - !f2fs_low_mem_mode(sbi)) { - f2fs_handle_step_decompress(ctx, intask); - } else if (enabled_steps) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - queue_work(ctx->sbi->post_read_wq, &ctx->work); - return; - } + if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + } else { + f2fs_verify_and_finish_bio(bio); } - - f2fs_verify_and_finish_bio(bio, intask); } static void f2fs_write_end_io(struct bio *bio) @@ -2263,7 +2247,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_load_compressed_page(sbi, page, blkaddr)) { if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic, true); + f2fs_decompress_cluster(dic); continue; } @@ -2280,7 +2264,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - f2fs_decompress_end_io(dic, ret, true); + f2fs_decompress_end_io(dic, ret); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 50ecc7599d39..43ab2a0831a3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1591,7 +1591,6 @@ struct decompress_io_ctx { void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ - struct work_struct free_work; /* work for late free this structure itself */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -4198,9 +4197,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic); void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr, bool in_task); + block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, @@ -4219,9 +4218,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, - bool in_task); -void f2fs_put_page_dic(struct page *page, bool in_task); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); +void f2fs_put_page_dic(struct page *page); unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); @@ -4267,14 +4265,13 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } -static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, - bool in_task) { } +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { } static inline void f2fs_end_read_compressed_page(struct page *page, - bool failed, block_t blkaddr, bool in_task) + bool 
failed, block_t blkaddr) { WARN_ON_ONCE(1); } -static inline void f2fs_put_page_dic(struct page *page, bool in_task) +static inline void f2fs_put_page_dic(struct page *page) { WARN_ON_ONCE(1); } From aebb04a23275196ec763e5eaccd177fd18d92c6f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jun 2022 10:57:24 -0700 Subject: [PATCH 448/580] f2fs: enforce single zone capacity In order to simplify the complicated per-zone capacity, let's support only one capacity for entire zoned device. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 43ab2a0831a3..7914007cdb8a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1675,6 +1675,7 @@ struct f2fs_sb_info { unsigned int meta_ino_num; /* meta inode number*/ unsigned int log_blocks_per_seg; /* log2 blocks per segment */ unsigned int blocks_per_seg; /* blocks per segment */ + unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ unsigned int segs_per_sec; /* segments per section */ unsigned int secs_per_zone; /* sections per zone */ unsigned int total_sections; /* total section count */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 00d6bc556a6b..6c3f82d519ea 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -101,6 +101,9 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define CAP_BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \ + (sbi)->unusable_blocks_per_sec) #define GET_SEC_FROM_SEG(sbi, segno) \ (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ From 86225a0169e084f963ad3ffec4a15cb0f0232b47 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jun 2022 11:03:57 -0700 Subject: [PATCH 449/580] f2fs: adjust zone capacity when considering valid block count This patch fixes counting unusable blocks set by zone capacity when checking the valid block count in a section. 
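A brief sketch of the relationship this pair of patches relies on; the macro bodies are copied from the segment.h hunk in the previous patch, while the usage line is only an illustrative fragment:

        #define BLKS_PER_SEC(sbi)                                       \
                ((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
        #define CAP_BLKS_PER_SEC(sbi)                                   \
                ((sbi)->segs_per_sec * (sbi)->blocks_per_seg -          \
                 (sbi)->unusable_blocks_per_sec)

        /* on a zoned device with limited zone capacity, a section is
         * "completely valid" when it reaches the capped size, never the
         * raw size, e.g.: */
        if (get_valid_blocks(sbi, segno, true) == CAP_BLKS_PER_SEC(sbi))
                ;       /* nothing useful left to migrate from this section */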
Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- fs/f2fs/file.c | 6 +++--- fs/f2fs/gc.c | 4 ++-- fs/f2fs/segment.c | 7 +++---- fs/f2fs/segment.h | 8 ++++---- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c92625ef16d0..c01471573977 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -39,7 +39,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = BLKS_PER_SEC(sbi); + blks_per_sec = CAP_BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, true); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 760de3e4fb50..07bfc1c3a3cb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1698,7 +1698,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, return 0; if (f2fs_is_pinned_file(inode)) { - block_t sec_blks = BLKS_PER_SEC(sbi); + block_t sec_blks = CAP_BLKS_PER_SEC(sbi); block_t sec_len = roundup(map.m_len, sec_blks); map.m_len = sec_blks; @@ -2506,7 +2506,7 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) ret = -EAGAIN; goto out; } - range->start += BLKS_PER_SEC(sbi); + range->start += CAP_BLKS_PER_SEC(sbi); if (range->start <= end) goto do_more; out: @@ -2631,7 +2631,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; } - sec_num = DIV_ROUND_UP(total, BLKS_PER_SEC(sbi)); + sec_num = DIV_ROUND_UP(total, CAP_BLKS_PER_SEC(sbi)); /* * make sure there are enough free section for LFS allocation, this can diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 818612e421bb..626baec9dd1b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -487,7 +487,7 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi, unsigned long long age, u, accu; unsigned long long max_mtime = sit_i->dirty_max_mtime; unsigned long long min_mtime = sit_i->dirty_min_mtime; - unsigned int sec_blocks = BLKS_PER_SEC(sbi); + unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi); unsigned int vblocks; unsigned int dirty_threshold = max(am->max_candidate_count, am->candidate_ratio * @@ -1488,7 +1488,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, */ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || (!force_migrate && get_valid_blocks(sbi, segno, true) == - BLKS_PER_SEC(sbi))) + CAP_BLKS_PER_SEC(sbi))) return submitted; if (check_valid_map(sbi, segno, off) == 0) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ba5bd119eb1c..504e267a8685 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -727,7 +727,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, get_valid_blocks(sbi, segno, true); f2fs_bug_on(sbi, unlikely(!valid_blocks || - valid_blocks == BLKS_PER_SEC(sbi))); + valid_blocks == CAP_BLKS_PER_SEC(sbi))); if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); @@ -763,7 +763,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || - valid_blocks == BLKS_PER_SEC(sbi)) { + valid_blocks == CAP_BLKS_PER_SEC(sbi)) { clear_bit(secno, dirty_i->dirty_secmap); return; } @@ -4483,7 +4483,6 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; block_t valid_blocks, usable_blks_in_seg; - block_t blks_per_sec = BLKS_PER_SEC(sbi); while (1) { /* find dirty segment based 
on free segmap */ @@ -4512,7 +4511,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) valid_blocks = get_valid_blocks(sbi, segno, true); secno = GET_SEC_FROM_SEG(sbi, segno); - if (!valid_blocks || valid_blocks == blks_per_sec) + if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) continue; if (IS_CURSEC(sbi, secno)) continue; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 6c3f82d519ea..6b605af2b87d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -612,10 +612,10 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); - unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi); - unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); - unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); - unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); + unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) From 1f8589824094b5b5e811c0f87052a1ac9c25a287 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Jul 2022 14:30:15 +0800 Subject: [PATCH 450/580] f2fs: fix to invalidate META_MAPPING before DIO write Quoted from commit e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write") " Encrypted pages during GC are read and cached in META_MAPPING. However, due to cached pages in META_MAPPING, there is an issue where newly written pages are lost by IPU or DIO writes. Thread A - f2fs_gc() Thread B /* phase 3 */ down_write(i_gc_rwsem) ra_data_block() ---- (a) up_write(i_gc_rwsem) f2fs_direct_IO() : - down_read(i_gc_rwsem) - __blockdev_direct_io() - get_data_block_dio_write() - f2fs_dio_submit_bio() ---- (b) - up_read(i_gc_rwsem) /* phase 4 */ down_write(i_gc_rwsem) move_data_block() ---- (c) up_write(i_gc_rwsem) (a) In phase 3 of f2fs_gc(), up-to-date page is read from storage and cached in META_MAPPING. (b) In thread B, writing new data by IPU or DIO write on same blkaddr as read in (a). cached page in META_MAPPING become out-dated. (c) In phase 4 of f2fs_gc(), out-dated page in META_MAPPING is copied to new blkaddr. In conclusion, the newly written data in (b) is lost. To address this issue, invalidating pages in META_MAPPING before IPU or DIO write. " In previous commit, we missed to cover extent cache hit case, and passed wrong value for parameter @end of invalidate_mapping_pages(), fix both issues. 
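In code form, the DIO-write half of the fix looks like the minimal sketch below, taken from the f2fs_map_blocks() hunks that follow; note the inclusive end index, which the earlier fix got wrong by passing map->m_pblk twice:

        /* for hardware encryption, but to avoid potential issue in future */
        if (flag == F2FS_GET_BLOCK_DIO) {
                f2fs_wait_on_block_writeback_range(inode,
                                        map->m_pblk, map->m_len);
                /* drop any copy that GC readahead cached in META_MAPPING so
                 * the new DIO data cannot be clobbered by stale pages when
                 * GC later copies the block out in phase 4 */
                invalidate_mapping_pages(META_MAPPING(sbi),
                                map->m_pblk, map->m_pblk + map->m_len - 1);
        }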
Fixes: 6aa58d8ad20a ("f2fs: readahead encrypted block during GC") Fixes: e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write") Cc: Hyeong-Jun Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1c1451d18661..89bc19684da0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1433,9 +1433,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, *map->m_next_extent = pgofs + map->m_len; /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO) + if (flag == F2FS_GET_BLOCK_DIO) { f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + invalidate_mapping_pages(META_MAPPING(sbi), + map->m_pblk, map->m_pblk + map->m_len - 1); + } if (map->m_multidev_dio) { block_t blk_addr = map->m_pblk; @@ -1652,7 +1655,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); invalidate_mapping_pages(META_MAPPING(sbi), - map->m_pblk, map->m_pblk); + map->m_pblk, map->m_pblk + map->m_len - 1); if (map->m_multidev_dio) { block_t blk_addr = map->m_pblk; From 45179d7fba31e40299327d3140f78bbff2c14862 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Jul 2022 11:17:15 +0800 Subject: [PATCH 451/580] f2fs: fix to check inline_data during compressed inode conversion When converting inode to compressed one via ioctl, it needs to check inline_data, since inline_data flag and compressed flag are incompatible. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7914007cdb8a..d7dd609404db 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4463,7 +4463,7 @@ static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode)) + f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } From 953b00d867b323f7cb322058f17e7c92398a5b07 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 7 Jul 2022 17:09:24 +0800 Subject: [PATCH 452/580] f2fs: allow compression of files without blocks Files created by truncate(1) have a size but no blocks, so they can be allowed to enable compression. 
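A sketch of the distinction the new check relies on: F2FS_HAS_BLOCKS() reports whether the inode already has data blocks allocated, whereas i_size is only the logical length, so a sparse file created by truncate(1) passes the new test but failed the old one.

	/* old: rejected any regular file with a non-zero size */
	if (S_ISREG(inode->i_mode) && inode->i_size)
		return -EINVAL;

	/* new: reject only files that already have data blocks on disk */
	if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
		return -EINVAL;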
Signed-off-by: Chao Liu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 07bfc1c3a3cb..e3bf943de823 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1892,7 +1892,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (iflags & F2FS_COMPR_FL) { if (!f2fs_may_compress(inode)) return -EINVAL; - if (S_ISREG(inode->i_mode) && inode->i_size) + if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) return -EINVAL; set_compress_context(inode); From bcc8f86a71709a95e7054a3b9093526a78a41a6f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Jul 2022 23:26:43 +0800 Subject: [PATCH 453/580] f2fs: invalidate meta pages only for post_read required inode After commit e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write"), invalidate_mapping_pages() will be called to avoid race condition in between IPU/DIO and readahead for GC. However, readahead flow is only used for post_read required inode, so this patch adds check condition to avoids unnecessary page cache invalidating for non-post_read inode. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++------ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 9 ++++++++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 89bc19684da0..208cd4c5c500 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1433,12 +1433,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, *map->m_next_extent = pgofs + map->m_len; /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO) { + if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); - invalidate_mapping_pages(META_MAPPING(sbi), - map->m_pblk, map->m_pblk + map->m_len - 1); - } if (map->m_multidev_dio) { block_t blk_addr = map->m_pblk; @@ -1654,8 +1651,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); - invalidate_mapping_pages(META_MAPPING(sbi), - map->m_pblk, map->m_pblk + map->m_len - 1); if (map->m_multidev_dio) { block_t blk_addr = map->m_pblk; @@ -2788,6 +2783,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .submitted = false, .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, + .post_read = f2fs_post_read_required(inode), .io_type = io_type, .io_wbc = wbc, .bio = bio, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d7dd609404db..bea816a8af8c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1197,6 +1197,7 @@ struct f2fs_io_info { bool retry; /* need to reallocate block address */ int compr_blocks; /* # of compressed block addresses */ bool encrypted; /* indicate file is encrypted */ + bool post_read; /* require post read */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 504e267a8685..259475a8ba4a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3433,7 +3433,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) goto drop_bio; } - invalidate_mapping_pages(META_MAPPING(sbi), + if (fio->post_read) + invalidate_mapping_pages(META_MAPPING(sbi), fio->new_blkaddr, fio->new_blkaddr); stat_inc_inplace_blocks(fio->sbi); @@ -3616,10 +3617,16 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t 
blkaddr) void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t i; + if (!f2fs_post_read_required(inode)) + return; + for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); + + invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) From dc10ba6c1805627ff4fa8179234350d4cbb5c282 Mon Sep 17 00:00:00 2001 From: qixiaoyu1 Date: Mon, 18 Jul 2022 11:28:40 +0800 Subject: [PATCH 454/580] f2fs: don't bother wait_ms by foreground gc f2fs_gc returns -EINVAL via f2fs_balance_fs when there is enough free secs after write checkpoint, but with gc_merge enabled, it will cause the sleep time of gc thread to be set to no_gc_sleep_time even if there are many dirty segments can be selected. Signed-off-by: qixiaoyu1 Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 626baec9dd1b..a5c0c835c99b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -150,8 +150,11 @@ static int gc_thread_func(void *data) gc_control.nr_free_secs = foreground ? 1 : 0; /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, &gc_control)) - wait_ms = gc_th->no_gc_sleep_time; + if (f2fs_gc(sbi, &gc_control)) { + /* don't bother wait_ms by foreground gc */ + if (!foreground) + wait_ms = gc_th->no_gc_sleep_time; + } if (foreground) wake_up_all(&gc_th->fggc_wq); From affb55ba78e564606120935059289f012e652662 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 18 Jul 2022 16:02:48 -0700 Subject: [PATCH 455/580] f2fs: introduce sysfs atomic write statistics introduce the below 4 new sysfs node for atomic write statistics. - current_atomic_write: the total current atomic write block count, which is not committed yet. - peak_atomic_write: the peak value of total current atomic write block count after boot. - committed_atomic_block: the accumulated total committed atomic write block count after boot. - revoked_atomic_block: the accumulated total revoked atomic write block count after boot. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 24 +++++++++++ fs/f2fs/data.c | 7 ++-- fs/f2fs/f2fs.h | 30 ++++++++++++++ fs/f2fs/file.c | 1 + fs/f2fs/segment.c | 6 +++ fs/f2fs/super.c | 1 + fs/f2fs/sysfs.c | 55 +++++++++++++++++++++++++ 7 files changed, 121 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 582b71796474..8c7428f9d4dc 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -561,3 +561,27 @@ Date: January 2022 Contact: "Jaegeuk Kim" Description: Controls max # of node block writes to be used for roll forward recovery. This can limit the roll forward recovery time. + +What: /sys/fs/f2fs//current_atomic_write +Date: July 2022 +Contact: "Daeho Jeong" +Description: Show the total current atomic write block count, which is not committed yet. + This is a read-only entry. + +What: /sys/fs/f2fs//peak_atomic_write +Date: July 2022 +Contact: "Daeho Jeong" +Description: Show the peak value of total current atomic write block count after boot. + If you write "0" here, you can initialize to "0". + +What: /sys/fs/f2fs//committed_atomic_block +Date: July 2022 +Contact: "Daeho Jeong" +Description: Show the accumulated total committed atomic write block count after boot. 
+ If you write "0" here, you can initialize to "0". + +What: /sys/fs/f2fs//revoked_atomic_block +Date: July 2022 +Contact: "Daeho Jeong" +Description: Show the accumulated total revoked atomic write block count after boot. + If you write "0" here, you can initialize to "0". diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 208cd4c5c500..ab87c6fc1281 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3463,12 +3463,11 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, struct inode *cow_inode = F2FS_I(inode)->cow_inode; pgoff_t index = page->index; int err = 0; - block_t ori_blk_addr; + block_t ori_blk_addr = NULL_ADDR; /* If pos is beyond the end of file, reserve a new block in COW inode */ if ((pos & PAGE_MASK) >= i_size_read(inode)) - return __reserve_data_block(cow_inode, index, blk_addr, - node_changed); + goto reserve_block; /* Look for the block in COW inode first */ err = __find_data_block(cow_inode, index, blk_addr); @@ -3482,10 +3481,12 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, if (err) return err; +reserve_block: /* Finally, we should reserve a new block in COW inode for the update */ err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); if (err) return err; + inc_atomic_write_cnt(inode); if (ori_blk_addr != NULL_ADDR) *blk_addr = ori_blk_addr; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bea816a8af8c..8958b40d63c9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -811,6 +811,8 @@ struct f2fs_inode_info { unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ + + unsigned int atomic_write_cnt; }; static inline void get_extent_info(struct extent_info *ext, @@ -1815,6 +1817,12 @@ struct f2fs_sb_info { int max_fragment_chunk; /* max chunk size for block fragmentation mode */ int max_fragment_hole; /* max hole size for block fragmentation mode */ + /* For atomic write statistics */ + atomic64_t current_atomic_write; + s64 peak_atomic_write; + u64 committed_atomic_block; + u64 revoked_atomic_block; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -2437,6 +2445,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } +static inline void inc_atomic_write_cnt(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + u64 current_write; + + fi->atomic_write_cnt++; + atomic64_inc(&sbi->current_atomic_write); + current_write = atomic64_read(&sbi->current_atomic_write); + if (current_write > sbi->peak_atomic_write) + sbi->peak_atomic_write = current_write; +} + +static inline void release_atomic_write_cnt(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + + atomic64_sub(fi->atomic_write_cnt, &sbi->current_atomic_write); + fi->atomic_write_cnt = 0; +} + static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { return atomic_read(&sbi->nr_pages[count_type]); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e3bf943de823..f6430c9e5acb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2145,6 +2145,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_update_time(sbi, REQ_TIME); fi->atomic_write_task = current; stat_update_max_atomic_write(inode); + fi->atomic_write_cnt = 0; out: 
inode_unlock(inode); mnt_drop_write_file(filp); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 259475a8ba4a..8371c88a0026 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -195,6 +195,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); iput(fi->cow_inode); fi->cow_inode = NULL; + release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_FILE); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); @@ -334,6 +335,11 @@ static int __f2fs_commit_atomic_write(struct inode *inode) } out: + if (ret) + sbi->revoked_atomic_block += fi->atomic_write_cnt; + else + sbi->committed_atomic_block += fi->atomic_write_cnt; + __complete_revoke_list(inode, &revoke_list, ret ? true : false); return ret; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0fae4b285971..13cb10c1ea74 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3575,6 +3575,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; spin_lock_init(&sbi->gc_urgent_high_lock); + atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f54d0d049d0e..2b514aa6186e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -343,6 +343,25 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); } + if (!strcmp(a->attr.name, "current_atomic_write")) { + s64 current_write = atomic64_read(&sbi->current_atomic_write); + + return snprintf(buf, PAGE_SIZE, "%lld\n", + current_write); + } + + if (!strcmp(a->attr.name, "peak_atomic_write")) + return snprintf(buf, PAGE_SIZE, "%lld\n", + sbi->peak_atomic_write); + + if (!strcmp(a->attr.name, "committed_atomic_block")) + return snprintf(buf, PAGE_SIZE, "%llu\n", + sbi->committed_atomic_block); + + if (!strcmp(a->attr.name, "revoked_atomic_block")) + return snprintf(buf, PAGE_SIZE, "%llu\n", + sbi->revoked_atomic_block); + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -604,6 +623,27 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "peak_atomic_write")) { + if (t != 0) + return -EINVAL; + sbi->peak_atomic_write = 0; + return count; + } + + if (!strcmp(a->attr.name, "committed_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->committed_atomic_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "revoked_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->revoked_atomic_block = 0; + return count; + } + *ui = (unsigned int)t; return count; @@ -709,6 +749,11 @@ static struct f2fs_attr f2fs_attr_##_name = { \ .offset = _offset \ } +#define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0444, \ + f2fs_sbi_show, NULL, \ + offsetof(struct struct_name, elname)) + #define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0644, \ f2fs_sbi_show, f2fs_sbi_store, \ @@ -843,6 +888,12 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole); +/* For atomic write */ +F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, current_atomic_write, current_atomic_write); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); +F2FS_RW_ATTR(F2FS_SBI, 
f2fs_sb_info, committed_atomic_block, committed_atomic_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -928,6 +979,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_reclaimed_segments), ATTR_LIST(max_fragment_chunk), ATTR_LIST(max_fragment_hole), + ATTR_LIST(current_atomic_write), + ATTR_LIST(peak_atomic_write), + ATTR_LIST(committed_atomic_block), + ATTR_LIST(revoked_atomic_block), NULL, }; From 305bdf2b6d93607d71c9c3244cb540f17e0cf634 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 25 Jul 2022 18:16:33 +0800 Subject: [PATCH 456/580] f2fs: fix to remove F2FS_COMPR_FL and tag F2FS_NOCOMP_FL at the same time If the inode has the compress flag, it will fail to use 'chattr -c +m' to remove its compress flag and tag no compress flag. However, the same command will be successful when executed again, as shown below: $ touch foo.txt $ chattr +c foo.txt $ chattr -c +m foo.txt chattr: Invalid argument while setting flags on foo.txt $ chattr -c +m foo.txt $ f2fs_io getflags foo.txt get a flag on foo.txt ret=0, flags=nocompression,inline_data Fix this by removing some checks in f2fs_setflags_common() that do not affect the original logic. I go through all the possible scenarios, and the results are as follows. Bold is the only thing that has changed. +---------------+-----------+-----------+----------+ | | file flags | + command +-----------+-----------+----------+ | | no flag | compr | nocompr | +---------------+-----------+-----------+----------+ | chattr +c | compr | compr | -EINVAL | | chattr -c | no flag | no flag | nocompr | | chattr +m | nocompr | -EINVAL | nocompr | | chattr -m | no flag | compr | no flag | | chattr +c +m | -EINVAL | -EINVAL | -EINVAL | | chattr +c -m | compr | compr | compr | | chattr -c +m | nocompr | *nocompr* | nocompr | | chattr -c -m | no flag | no flag | no flag | +---------------+-----------+-----------+----------+ Link: https://lore.kernel.org/linux-f2fs-devel/20220621064833.1079383-1-chaoliu719@gmail.com/ Fixes: 4c8ff7095bef ("f2fs: support data compression") Reviewed-by: Chao Yu Signed-off-by: Chao Liu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f6430c9e5acb..4ab2314aad3c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1886,10 +1886,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (masked_flags & F2FS_COMPR_FL) { if (!f2fs_disable_compressed_file(inode)) return -EINVAL; - } - if (iflags & F2FS_NOCOMP_FL) - return -EINVAL; - if (iflags & F2FS_COMPR_FL) { + } else { if (!f2fs_may_compress(inode)) return -EINVAL; if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) @@ -1898,10 +1895,6 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) set_compress_context(inode); } } - if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) { - if (masked_flags & F2FS_COMPR_FL) - return -EINVAL; - } fi->i_flags = iflags | (fi->i_flags & ~mask); f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) && From bdc3a5e2d9ec4521c1b56c7087f5e136a9239dfc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Jul 2022 00:03:23 +0800 Subject: [PATCH 457/580] f2fs: fix to avoid use f2fs_bug_on() in f2fs_new_node_page() As Dipanjan Das reported, syzkaller found a f2fs bug as below: RIP: 0010:f2fs_new_node_page+0x19ac/0x1fc0 fs/f2fs/node.c:1295 
Call Trace: write_all_xattrs fs/f2fs/xattr.c:487 [inline] __f2fs_setxattr+0xe76/0x2e10 fs/f2fs/xattr.c:743 f2fs_setxattr+0x233/0xab0 fs/f2fs/xattr.c:790 f2fs_xattr_generic_set+0x133/0x170 fs/f2fs/xattr.c:86 __vfs_setxattr+0x115/0x180 fs/xattr.c:182 __vfs_setxattr_noperm+0x125/0x5f0 fs/xattr.c:216 __vfs_setxattr_locked+0x1cf/0x260 fs/xattr.c:277 vfs_setxattr+0x13f/0x330 fs/xattr.c:303 setxattr+0x146/0x160 fs/xattr.c:611 path_setxattr+0x1a7/0x1d0 fs/xattr.c:630 __do_sys_lsetxattr fs/xattr.c:653 [inline] __se_sys_lsetxattr fs/xattr.c:649 [inline] __x64_sys_lsetxattr+0xbd/0x150 fs/xattr.c:649 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 NAT entry and nat bitmap can be inconsistent, e.g. one nid is free in nat bitmap, and blkaddr in its NAT entry is not NULL_ADDR, it may trigger BUG_ON() in f2fs_new_node_page(), fix it. Reported-by: Dipanjan Das Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9cb66f35d191..69b011a461a8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1290,7 +1290,11 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) dec_valid_node_count(sbi, dn->inode, !ofs); goto fail; } - f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); + if (unlikely(new_ni.blk_addr != NULL_ADDR)) { + err = -EFSCORRUPTED; + set_sbi_flag(sbi, SBI_NEED_FSCK); + goto fail; + } #endif new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; From b1ab178e7232e646039904671ed5ff63a2951c59 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Jul 2022 00:04:17 +0800 Subject: [PATCH 458/580] f2fs: obsolete unused MAX_DISCARD_BLOCKS After commit a7eeb823854c ("f2fs: use bitmap in discard_entry"), MAX_DISCARD_BLOCKS became obsolete, remove it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8958b40d63c9..b9a1f7df7b6e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -226,7 +226,6 @@ enum { #define CP_PAUSE 0x00000040 #define CP_RESIZE 0x00000080 -#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ From a2464a3cba1e7a8fbd16083d4eac69e1c5ca6569 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Jul 2022 22:51:05 +0800 Subject: [PATCH 459/580] f2fs: fix to do sanity check on segment type in build_sit_entries() As Wenqing Liu reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216285 RIP: 0010:memcpy_erms+0x6/0x10 f2fs_update_meta_page+0x84/0x570 [f2fs] change_curseg.constprop.0+0x159/0xbd0 [f2fs] f2fs_do_replace_block+0x5c7/0x18a0 [f2fs] f2fs_replace_block+0xeb/0x180 [f2fs] recover_data+0x1abd/0x6f50 [f2fs] f2fs_recover_fsync_data+0x12ce/0x3250 [f2fs] f2fs_fill_super+0x4459/0x6190 [f2fs] mount_bdev+0x2cf/0x3b0 legacy_get_tree+0xed/0x1d0 vfs_get_tree+0x81/0x2b0 path_mount+0x47e/0x19d0 do_mount+0xce/0xf0 __x64_sys_mount+0x12c/0x1a0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd The root cause is segment type is invalid, so in f2fs_do_replace_block(), f2fs accesses f2fs_sm_info::curseg_array with out-of-range segment type, result in accessing invalid curseg->sum_blk during memcpy in f2fs_update_meta_page(). 
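Valid on-disk segment types are limited to the persistent logs (hot/warm/cold data and node, i.e. values below NR_PERSISTENT_LOG); a larger value taken from a corrupted SIT entry indexes past curseg_array. Roughly, the failing path looks like the sketch below (blk_addr stands for the summary block address computed by the caller):

	/* se->type comes straight from the on-disk SIT entry; if it is out of
	 * range, CURSEG_I() points past the persistent log array and the
	 * subsequent f2fs_update_meta_page() memcpy()s from a bogus sum_blk */
	struct curseg_info *curseg = CURSEG_I(sbi, type);
	f2fs_update_meta_page(sbi, curseg->sum_blk, blk_addr);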
Fix this by adding sanity check on segment type in build_sit_entries(). Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8371c88a0026..ae4ac0c32f25 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4375,6 +4375,12 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) return err; seg_info_from_raw_sit(se, &sit); + if (se->type >= NR_PERSISTENT_LOG) { + f2fs_err(sbi, "Invalid segment type: %u, segno: %u", + se->type, start); + return -EFSCORRUPTED; + } + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { @@ -4423,6 +4429,13 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) break; seg_info_from_raw_sit(se, &sit); + if (se->type >= NR_PERSISTENT_LOG) { + f2fs_err(sbi, "Invalid segment type: %u, segno: %u", + se->type, start); + err = -EFSCORRUPTED; + break; + } + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { From 66e8e51c22cbbcdeb91365f2b1d1493bcdf44017 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 5 Jun 2019 08:04:49 -0700 Subject: [PATCH 460/580] vfs: introduce file_modified() helper The combination of file_remove_privs() and file_update_mtime() is quite common in filesystem ->write_iter() methods. Modelled after the helper file_accessed(), introduce file_modified() and use it from generic_remap_file_range_prep(). Note that the order of calling file_remove_privs() before file_update_mtime() in the helper was matched to the more common order by filesystems and not the current order in generic_remap_file_range_prep(). Signed-off-by: Amir Goldstein Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/inode.c | 20 ++++++++++++++++++++ include/linux/fs.h | 2 ++ 2 files changed, 22 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index 961808880f43..89a0ffeaaadf 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1888,6 +1888,26 @@ int file_update_time(struct file *file) } EXPORT_SYMBOL(file_update_time); +/* Caller must hold the file's inode lock */ +int file_modified(struct file *file) +{ + int err; + + /* + * Clear the security bits if the process is not being run by root. + * This keeps people from modifying setuid and setgid binaries. + */ + err = file_remove_privs(file); + if (err) + return err; + + if (unlikely(file->f_mode & FMODE_NOCMTIME)) + return 0; + + return file_update_time(file); +} +EXPORT_SYMBOL(file_modified); + int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) diff --git a/include/linux/fs.h b/include/linux/fs.h index 78a70cb337ae..84b45a566538 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2138,6 +2138,8 @@ static inline void file_accessed(struct file *file) touch_atime(&file->f_path); } +extern int file_modified(struct file *file); + int sync_inode(struct inode *inode, struct writeback_control *wbc); int sync_inode_metadata(struct inode *inode, int wait); From 12504c2666badb1cb7476f8cffc0379cc212a668 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 May 2022 11:37:23 +0800 Subject: [PATCH 461/580] f2fs: fix fallocate to use file_modified to update permissions consistently This patch tries to fix permission consistency issue as all other mainline filesystems. Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. 
This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero, collapse, insert range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Cc: stable@kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4ab2314aad3c..b8b92970801b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1797,6 +1797,10 @@ static long f2fs_fallocate(struct file *file, int mode, inode_lock(inode); + ret = file_modified(file); + if (ret) + goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) goto out; From 4cbd7cbe6fc38c122e31a96a0bd35c48792bb3a7 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 1 Aug 2022 10:08:08 -0700 Subject: [PATCH 462/580] f2fs: revive F2FS_IOC_ABORT_VOLATILE_WRITE F2FS_IOC_ABORT_VOLATILE_WRITE was used to abort a atomic write before. However it was removed accidentally. So revive it by changing the name, since volatile write had gone. Signed-off-by: Daeho Jeong Fiexes: 7bc155fec5b3("f2fs: kill volatile write support") Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 29 +++++++++++++++++++++++++++-- include/uapi/linux/f2fs.h | 2 +- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b8b92970801b..05df6106170a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2182,6 +2182,30 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) return ret; } +static int f2fs_ioc_abort_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) + f2fs_abort_atomic_write(inode, true); + + inode_unlock(inode); + + mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return ret; +} + static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4202,9 +4226,10 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_start_atomic_write(filp); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_ABORT_ATOMIC_WRITE: + return f2fs_ioc_abort_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: case F2FS_IOC_RELEASE_VOLATILE_WRITE: - case F2FS_IOC_ABORT_VOLATILE_WRITE: return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); @@ -4595,7 +4620,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_COMMIT_ATOMIC_WRITE: case F2FS_IOC_START_VOLATILE_WRITE: case F2FS_IOC_RELEASE_VOLATILE_WRITE: - case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_ABORT_ATOMIC_WRITE: case F2FS_IOC_SHUTDOWN: case FS_IOC_SET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_PWSALT: diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 352a822d4370..3121d127d5aa 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -13,7 +13,7 @@ #define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define 
F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) -#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_ABORT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) #define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ From d50b0df496ec662e61a1cd3663c9a89d5081fd7b Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 1 Aug 2022 19:26:04 +0800 Subject: [PATCH 463/580] f2fs: fix null-ptr-deref in f2fs_get_dnode_of_data There is issue as follows when test f2fs atomic write: F2FS-fs (loop0): Can't find valid F2FS filesystem in 2th superblock F2FS-fs (loop0): invalid crc_offset: 0 F2FS-fs (loop0): f2fs_check_nid_range: out-of-range nid=1, run fsck to fix. F2FS-fs (loop0): f2fs_check_nid_range: out-of-range nid=2, run fsck to fix. ================================================================== BUG: KASAN: null-ptr-deref in f2fs_get_dnode_of_data+0xac/0x16d0 Read of size 8 at addr 0000000000000028 by task rep/1990 CPU: 4 PID: 1990 Comm: rep Not tainted 5.19.0-rc6-next-20220715 #266 Call Trace: dump_stack_lvl+0x6e/0x91 print_report.cold+0x49a/0x6bb kasan_report+0xa8/0x130 f2fs_get_dnode_of_data+0xac/0x16d0 f2fs_do_write_data_page+0x2a5/0x1030 move_data_page+0x3c5/0xdf0 do_garbage_collect+0x2015/0x36c0 f2fs_gc+0x554/0x1d30 f2fs_balance_fs+0x7f5/0xda0 f2fs_write_single_data_page+0xb66/0xdc0 f2fs_write_cache_pages+0x716/0x1420 f2fs_write_data_pages+0x84f/0x9a0 do_writepages+0x130/0x3a0 filemap_fdatawrite_wbc+0x87/0xa0 file_write_and_wait_range+0x157/0x1c0 f2fs_do_sync_file+0x206/0x12d0 f2fs_sync_file+0x99/0xc0 vfs_fsync_range+0x75/0x140 f2fs_file_write_iter+0xd7b/0x1850 vfs_write+0x645/0x780 ksys_write+0xf1/0x1e0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd As 3db1de0e582c commit changed atomic write way which new a cow_inode for atomic write file, and also mark cow_inode as FI_ATOMIC_FILE. When f2fs_do_write_data_page write cow_inode will use cow_inode's cow_inode which is NULL. Then will trigger null-ptr-deref. To solve above issue, introduce FI_COW_FILE flag for COW inode. 
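With the new flag the original file and its COW inode are tagged differently, so the write path can tell them apart instead of following cow_inode->cow_inode (which is NULL); roughly:

	/* the user-visible file keeps FI_ATOMIC_FILE; the shadow COW inode is
	 * marked FI_COW_FILE and is routed to the hot-data log by its own flag */
	set_inode_flag(inode, FI_ATOMIC_FILE);
	set_inode_flag(fi->cow_inode, FI_COW_FILE);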
Fiexes: 3db1de0e582c("f2fs: change the current atomic write way") Signed-off-by: Ye Bin Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b9a1f7df7b6e..aa5b6bcc3dcb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -753,6 +753,7 @@ enum { FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ + FI_COW_FILE, /* indicate COW file */ FI_MAX, /* max flag, never be used */ }; @@ -3250,6 +3251,11 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } +static inline bool f2fs_is_cow_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_COW_FILE); +} + static inline bool f2fs_is_first_block_written(struct inode *inode) { return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 05df6106170a..958d4da93d10 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2135,7 +2135,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); set_inode_flag(inode, FI_ATOMIC_FILE); - set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + set_inode_flag(fi->cow_inode, FI_COW_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ae4ac0c32f25..82e1a77efaa0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -192,7 +192,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) if (f2fs_is_atomic_file(inode)) { if (clean) truncate_inode_pages_final(inode->i_mapping); - clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + clear_inode_flag(fi->cow_inode, FI_COW_FILE); iput(fi->cow_inode); fi->cow_inode = NULL; release_atomic_write_cnt(inode); @@ -3172,7 +3172,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_atomic_file(inode)) + f2fs_is_cow_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { From 28223e7b69437635c7d4f4db98b7cb1159bb0c4b Mon Sep 17 00:00:00 2001 From: Eunhee Rho Date: Mon, 1 Aug 2022 13:40:02 +0900 Subject: [PATCH 464/580] f2fs: remove device type check for direct IO To ensure serialized IOs, f2fs allows only LFS mode for zoned device. Remove redundant check for direct IO. Signed-off-by: Eunhee Rho Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index aa5b6bcc3dcb..b5eb01e4d95b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4557,12 +4557,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, /* disallow direct IO if any of devices has unaligned blksize */ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) return true; - /* - * for blkzoned device, fallback direct IO to buffered IO, so - * all IOs can be serialized by log-structured write. 
- */ - if (f2fs_sb_has_blkzoned(sbi)) - return true; + if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { if (block_unaligned_IO(inode, iocb, iter)) return true; From 01a432966ab79dcb332d491f93455b6397e0aa70 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 3 Aug 2022 20:33:54 -0700 Subject: [PATCH 465/580] f2fs: do not set compression bit if kernel doesn't support If kernel doesn't have CONFIG_F2FS_FS_COMPRESSION, a file having FS_COMPR_FL via ioctl(FS_IOC_SETFLAGS) is unaccessible due to f2fs_is_compress_backend_ready(). Let's avoid it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 ++++++- fs/f2fs/file.c | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b5eb01e4d95b..6fb0239e1475 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4334,8 +4334,9 @@ static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, unsigned int c_len) { } #endif -static inline void set_compress_context(struct inode *inode) +static inline int set_compress_context(struct inode *inode) { +#ifdef CONFIG_F2FS_FS_COMPRESSION struct f2fs_sb_info *sbi = F2FS_I_SB(inode); F2FS_I(inode)->i_compress_algorithm = @@ -4358,6 +4359,10 @@ static inline void set_compress_context(struct inode *inode) stat_inc_compr_inode(inode); inc_compr_inode_stat(inode); f2fs_mark_inode_dirty_sync(inode, true); + return 0; +#else + return -EOPNOTSUPP; +#endif } static inline bool f2fs_disable_compressed_file(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 958d4da93d10..44721d0dae3d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1895,8 +1895,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) return -EINVAL; if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) return -EINVAL; - - set_compress_context(inode); + if (set_compress_context(inode)) + return -EOPNOTSUPP; } } From 4c9ce016624ddd34b456c0f3b62c05cf155b7000 Mon Sep 17 00:00:00 2001 From: Jaewook Kim Date: Wed, 3 Aug 2022 17:53:58 +0900 Subject: [PATCH 466/580] f2fs: do not allow to decompress files have FI_COMPRESS_RELEASED If a file has FI_COMPRESS_RELEASED, all writes for it should not be allowed. However, as of now, in case of compress_mode=user, writes triggered by IOCTLs like F2FS_IOC_DE/COMPRESS_FILE are allowed unexpectly, which could crash that file. To fix it, let's do not allow F2FS_IOC_DE/COMPRESS_IOCTL if a file already has FI_COMPRESS_RELEASED flag. This is the reproduction process: 1. $ touch ./file 2. $ chattr +c ./file 3. $ dd if=/dev/random of=./file bs=4096 count=30 conv=notrunc 4. $ dd if=/dev/zero of=./file bs=4096 count=34 seek=30 conv=notrunc 5. $ sync 6. $ do_compress ./file ; call F2FS_IOC_COMPRESS_FILE 7. $ get_compr_blocks ./file ; call F2FS_IOC_GET_COMPRESS_BLOCKS 8. $ release ./file ; call F2FS_IOC_RELEASE_COMPRESS_BLOCKS 9. $ do_compress ./file ; call F2FS_IOC_COMPRESS_FILE again 10. $ get_compr_blocks ./file ; call F2FS_IOC_GET_COMPRESS_BLOCKS again This reproduction process is tested in 128kb cluster size. You can find compr_blocks has a negative value. 
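The same guard is added to both F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE; in sketch form:

	/* once compressed blocks have been released back to the user quota, any
	 * further write (including de/re-compression) must be refused, otherwise
	 * the compressed block count can go negative as in the reproduction above */
	if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
		ret = -EINVAL;
		goto out;
	}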
Fixes: 5fdb322ff2c2b ("f2fs: add F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE") Signed-off-by: Junbeom Yeom Signed-off-by: Sungjong Seo Signed-off-by: Youngjin Gil Signed-off-by: Jaewook Kim Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 44721d0dae3d..29bbd1e50998 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4107,6 +4107,11 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) goto out; } + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4174,6 +4179,11 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) goto out; } + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; From acc8d7d0c99aafc311ddbc083bf690950f1243de Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 2 Aug 2022 12:24:37 -0700 Subject: [PATCH 467/580] f2fs: handle decompress only post processing in softirq Now decompression is being handled in workqueue and it makes read I/O latency non-deterministic, because of the non-deterministic scheduling nature of workqueues. So, I made it handled in softirq context only if possible, not in low memory devices, since this modification will maintain decompresion related memory a little longer. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 203 ++++++++++++++++++++++++++++++--------------- fs/f2fs/data.c | 52 ++++++++---- fs/f2fs/f2fs.h | 17 ++-- 3 files changed, 179 insertions(+), 93 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index eac3d13f9580..970304cb83c3 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -699,14 +699,19 @@ static int f2fs_compress_pages(struct compress_ctx *cc) return ret; } -void f2fs_decompress_cluster(struct decompress_io_ctx *dic) +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc); +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc); + +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; + bool bypass_callback = false; int ret; - int i; trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); @@ -716,41 +721,10 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) goto out_end_io; } - dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); - if (!dic->tpages) { - ret = -ENOMEM; - goto out_end_io; - } - - for (i = 0; i < dic->cluster_size; i++) { - if (dic->rpages[i]) { - dic->tpages[i] = dic->rpages[i]; - continue; - } - - dic->tpages[i] = f2fs_compress_alloc_page(); - if (!dic->tpages[i]) { - ret = -ENOMEM; - goto out_end_io; - } - } - - if (cops->init_decompress_ctx) { - ret = cops->init_decompress_ctx(dic); - if (ret) - goto out_end_io; - } - - dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); - if (!dic->rbuf) { - ret = -ENOMEM; - goto out_destroy_decompress_ctx; - } - - dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); - if (!dic->cbuf) { - ret = -ENOMEM; - goto out_vunmap_rbuf; + ret = 
f2fs_prepare_decomp_mem(dic, false); + if (ret) { + bypass_callback = true; + goto out_release; } dic->clen = le32_to_cpu(dic->cbuf->clen); @@ -758,7 +732,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; - goto out_vunmap_cbuf; + goto out_release; } ret = cops->decompress_pages(dic); @@ -779,17 +753,13 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) } } -out_vunmap_cbuf: - vm_unmap_ram(dic->cbuf, dic->nr_cpages); -out_vunmap_rbuf: - vm_unmap_ram(dic->rbuf, dic->cluster_size); -out_destroy_decompress_ctx: - if (cops->destroy_decompress_ctx) - cops->destroy_decompress_ctx(dic); +out_release: + f2fs_release_decomp_mem(dic, bypass_callback, false); + out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - f2fs_decompress_end_io(dic, ret); + f2fs_decompress_end_io(dic, ret, in_task); } /* @@ -799,7 +769,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) * (or in the case of a failure, cleans up without actually decompressing). */ void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr) + block_t blkaddr, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); @@ -809,12 +779,12 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, if (failed) WRITE_ONCE(dic->failed, true); - else if (blkaddr) + else if (blkaddr && in_task) f2fs_cache_compressed_page(sbi, page, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic); + f2fs_decompress_cluster(dic, in_task); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1522,16 +1492,85 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, return err; } -static void f2fs_free_dic(struct decompress_io_ctx *dic); +static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, + bool pre_alloc) +{ + return pre_alloc ^ f2fs_low_mem_mode(sbi); +} + +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + int i; + + if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + return 0; + + dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); + if (!dic->tpages) + return -ENOMEM; + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) { + dic->tpages[i] = dic->rpages[i]; + continue; + } + + dic->tpages[i] = f2fs_compress_alloc_page(); + if (!dic->tpages[i]) + return -ENOMEM; + } + + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); + if (!dic->rbuf) + return -ENOMEM; + + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); + if (!dic->cbuf) + return -ENOMEM; + + if (cops->init_decompress_ctx) { + int ret = cops->init_decompress_ctx(dic); + + if (ret) + return ret; + } + + return 0; +} + +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + + if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) + return; + + if (!bypass_destroy_callback && cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); + + if (dic->cbuf) + vm_unmap_ram(dic->cbuf, dic->nr_cpages); + + if (dic->rbuf) + vm_unmap_ram(dic->rbuf, dic->cluster_size); +} + +static void f2fs_free_dic(struct decompress_io_ctx *dic, + 
bool bypass_destroy_callback); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); - int i; + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + int i, ret; - dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, - false, F2FS_I_SB(cc->inode)); + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!dic) return ERR_PTR(-ENOMEM); @@ -1557,32 +1596,43 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->nr_rpages = cc->cluster_size; dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); - if (!dic->cpages) + if (!dic->cpages) { + ret = -ENOMEM; goto out_free; + } for (i = 0; i < dic->nr_cpages; i++) { struct page *page; page = f2fs_compress_alloc_page(); - if (!page) + if (!page) { + ret = -ENOMEM; goto out_free; + } f2fs_set_compressed_page(page, cc->inode, start_idx + i + 1, dic); dic->cpages[i] = page; } + ret = f2fs_prepare_decomp_mem(dic, true); + if (ret) + goto out_free; + return dic; out_free: - f2fs_free_dic(dic); - return ERR_PTR(-ENOMEM); + f2fs_free_dic(dic, true); + return ERR_PTR(ret); } -static void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback) { int i; + f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); + if (dic->tpages) { for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) @@ -1607,17 +1657,33 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic) kmem_cache_free(dic_entry_slab, dic); } -static void f2fs_put_dic(struct decompress_io_ctx *dic) +static void f2fs_late_free_dic(struct work_struct *work) { - if (refcount_dec_and_test(&dic->refcnt)) - f2fs_free_dic(dic); + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, free_work); + + f2fs_free_dic(dic, false); +} + +static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) +{ + if (refcount_dec_and_test(&dic->refcnt)) { + if (in_task) { + f2fs_free_dic(dic, false); + } else { + INIT_WORK(&dic->free_work, f2fs_late_free_dic); + queue_work(F2FS_I_SB(dic->inode)->post_read_wq, + &dic->free_work); + } + } } /* * Update and unlock the cluster's pagecache pages, and release the reference to * the decompress_io_ctx that was being held for I/O completion. */ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) { int i; @@ -1638,7 +1704,7 @@ static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) unlock_page(rpage); } - f2fs_put_dic(dic); + f2fs_put_dic(dic, in_task); } static void f2fs_verify_cluster(struct work_struct *work) @@ -1655,14 +1721,15 @@ static void f2fs_verify_cluster(struct work_struct *work) SetPageError(rpage); } - __f2fs_decompress_end_io(dic, false); + __f2fs_decompress_end_io(dic, false, true); } /* * This is called when a compressed cluster has been decompressed * (or failed to be read and/or decompressed). 
*/ -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) { if (!failed && dic->need_verity) { /* @@ -1674,7 +1741,7 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); } else { - __f2fs_decompress_end_io(dic, failed); + __f2fs_decompress_end_io(dic, failed, in_task); } } @@ -1683,12 +1750,12 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) * * This is called when the page is no longer needed and can be freed. */ -void f2fs_put_page_dic(struct page *page) +void f2fs_put_page_dic(struct page *page, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); - f2fs_put_dic(dic); + f2fs_put_dic(dic, in_task); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ab87c6fc1281..8cec5ff93118 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -117,7 +117,7 @@ struct bio_post_read_ctx { block_t fs_blkaddr; }; -static void f2fs_finish_read_bio(struct bio *bio) +static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; int iter_all; @@ -131,8 +131,9 @@ static void f2fs_finish_read_bio(struct bio *bio) if (f2fs_is_compressed_page(page)) { if (bio->bi_status) - f2fs_end_read_compressed_page(page, true, 0); - f2fs_put_page_dic(page); + f2fs_end_read_compressed_page(page, true, 0, + in_task); + f2fs_put_page_dic(page, in_task); continue; } @@ -189,7 +190,7 @@ static void f2fs_verify_bio(struct work_struct *work) fsverity_verify_bio(bio); } - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, true); } /* @@ -201,7 +202,7 @@ static void f2fs_verify_bio(struct work_struct *work) * can involve reading verity metadata pages from the file, and these verity * metadata pages may be encrypted and/or compressed. */ -static void f2fs_verify_and_finish_bio(struct bio *bio) +static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) { struct bio_post_read_ctx *ctx = bio->bi_private; @@ -209,7 +210,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio) INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); } else { - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, in_task); } } @@ -222,7 +223,8 @@ static void f2fs_verify_and_finish_bio(struct bio *bio) * that the bio includes at least one compressed page. The actual decompression * is done on a per-cluster basis, not a per-bio basis. */ -static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, + bool in_task) { struct bio_vec *bv; int iter_all; @@ -235,7 +237,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) /* PG_error was set if decryption failed. 
*/ if (f2fs_is_compressed_page(page)) f2fs_end_read_compressed_page(page, PageError(page), - blkaddr); + blkaddr, in_task); else all_compressed = false; @@ -260,15 +262,16 @@ static void f2fs_post_read_work(struct work_struct *work) fscrypt_decrypt_bio(ctx->bio); if (ctx->enabled_steps & STEP_DECOMPRESS) - f2fs_handle_step_decompress(ctx); + f2fs_handle_step_decompress(ctx, true); - f2fs_verify_and_finish_bio(ctx->bio); + f2fs_verify_and_finish_bio(ctx->bio, true); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); struct bio_post_read_ctx *ctx; + bool intask = in_task(); iostat_update_and_unbind_ctx(bio, 0); ctx = bio->bi_private; @@ -279,16 +282,29 @@ static void f2fs_read_end_io(struct bio *bio) } if (bio->bi_status) { - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, intask); return; } - if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - queue_work(ctx->sbi->post_read_wq, &ctx->work); - } else { - f2fs_verify_and_finish_bio(bio); + if (ctx) { + unsigned int enabled_steps = ctx->enabled_steps & + (STEP_DECRYPT | STEP_DECOMPRESS); + + /* + * If we have only decompression step between decompression and + * decrypt, we don't need post processing for this. + */ + if (enabled_steps == STEP_DECOMPRESS && + !f2fs_low_mem_mode(sbi)) { + f2fs_handle_step_decompress(ctx, intask); + } else if (enabled_steps) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + return; + } } + + f2fs_verify_and_finish_bio(bio, intask); } static void f2fs_write_end_io(struct bio *bio) @@ -2245,7 +2261,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_load_compressed_page(sbi, page, blkaddr)) { if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic); + f2fs_decompress_cluster(dic, true); continue; } @@ -2262,7 +2278,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - f2fs_decompress_end_io(dic, ret); + f2fs_decompress_end_io(dic, ret, true); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6fb0239e1475..cba4ae038421 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1594,6 +1594,7 @@ struct decompress_io_ctx { void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ + struct work_struct free_work; /* work for late free this structure itself */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -4234,9 +4235,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_cluster(struct decompress_io_ctx *dic); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr); + block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, @@ -4255,8 +4256,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t 
*last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); -void f2fs_put_page_dic(struct page *page); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task); +void f2fs_put_page_dic(struct page *page, bool in_task); unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); @@ -4302,13 +4304,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } -static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { } +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, + bool in_task) { } static inline void f2fs_end_read_compressed_page(struct page *page, - bool failed, block_t blkaddr) + bool failed, block_t blkaddr, bool in_task) { WARN_ON_ONCE(1); } -static inline void f2fs_put_page_dic(struct page *page) +static inline void f2fs_put_page_dic(struct page *page, bool in_task) { WARN_ON_ONCE(1); } From 8d84ee1f6733bf76f8f7cd67f279e7ed2b94836c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 4 Aug 2022 21:38:21 +0800 Subject: [PATCH 468/580] f2fs: clean up f2fs_abort_atomic_write() f2fs_abort_atomic_write() has checked whether current inode is atomic_write one or not, it's redundant to check in its caller, remove it for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 9 +++------ fs/f2fs/inode.c | 3 +-- fs/f2fs/segment.c | 25 +++++++++++++------------ fs/f2fs/super.c | 3 +-- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 29bbd1e50998..c299b98f8d93 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1839,8 +1839,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) atomic_read(&inode->i_writecount) != 1) return 0; - if (f2fs_is_atomic_file(inode)) - f2fs_abort_atomic_write(inode, true); + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1854,8 +1853,7 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * until all the writers close its file. Since this should be done * before dropping file lock, it needs to do in ->flush. 
*/ - if (f2fs_is_atomic_file(inode) && - F2FS_I(inode)->atomic_write_task == current) + if (F2FS_I(inode)->atomic_write_task == current) f2fs_abort_atomic_write(inode, true); return 0; } @@ -2196,8 +2194,7 @@ static int f2fs_ioc_abort_atomic_write(struct file *filp) inode_lock(inode); - if (f2fs_is_atomic_file(inode)) - f2fs_abort_atomic_write(inode, true); + f2fs_abort_atomic_write(inode, true); inode_unlock(inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 52ac0122f62a..615c51a2ff2b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -744,8 +744,7 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; - if (f2fs_is_atomic_file(inode)) - f2fs_abort_atomic_write(inode, true); + f2fs_abort_atomic_write(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 82e1a77efaa0..8ed985685d2e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -189,19 +189,20 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - if (f2fs_is_atomic_file(inode)) { - if (clean) - truncate_inode_pages_final(inode->i_mapping); - clear_inode_flag(fi->cow_inode, FI_COW_FILE); - iput(fi->cow_inode); - fi->cow_inode = NULL; - release_atomic_write_cnt(inode); - clear_inode_flag(inode, FI_ATOMIC_FILE); + if (!f2fs_is_atomic_file(inode)) + return; - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files--; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - } + if (clean) + truncate_inode_pages_final(inode->i_mapping); + clear_inode_flag(fi->cow_inode, FI_COW_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_FILE); + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + sbi->atomic_files--; + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 13cb10c1ea74..445244910730 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1386,8 +1386,7 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - if (f2fs_is_atomic_file(inode)) - f2fs_abort_atomic_write(inode, true); + f2fs_abort_atomic_write(inode, true); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); From 798b68d6644855055259f2247db4db76b3506264 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Sun, 31 Jul 2022 11:33:45 +0800 Subject: [PATCH 469/580] f2fs: intorduce f2fs_all_cluster_page_ready When write total cluster, all pages is uptodate, there is not need to call f2fs_prepare_compress_overwrite, intorduce f2fs_all_cluster_page_ready to avoid this. 
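The readiness check itself reduces to a scan over the looked-up page batch: it must start on a cluster boundary, cover cluster_size consecutive page indices, and (for the full-overwrite case) have every page uptodate. A minimal userspace sketch of that predicate, with a hypothetical fake_page struct standing in for struct page:

#include <stdbool.h>
#include <stdio.h>

/* illustrative stand-in for struct page; not kernel API */
struct fake_page { unsigned long index; bool uptodate; };

static bool all_cluster_pages_ready(const struct fake_page *pages, int start,
				    int nr_pages, int cluster_size, bool uptodate)
{
	unsigned long pgidx = pages[start].index;
	int i = uptodate ? 0 : 1;

	/* the uptodate check only makes sense from a cluster boundary */
	if (uptodate && (pgidx % cluster_size))
		return false;
	if (nr_pages - start < cluster_size)
		return false;

	for (; i < cluster_size; i++) {
		if (pages[start + i].index != pgidx + i)
			return false;
		if (uptodate && !pages[start + i].uptodate)
			return false;
	}
	return true;
}

int main(void)
{
	struct fake_page batch[4] = {
		{ 8, true }, { 9, true }, { 10, true }, { 11, true }
	};

	printf("%d\n", all_cluster_pages_ready(batch, 0, 4, 4, true)); /* 1 */
	batch[2].uptodate = false;
	printf("%d\n", all_cluster_pages_ready(batch, 0, 4, 4, true)); /* 0 */
	return 0;
}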
Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 21 ++++++++++++++------- fs/f2fs/data.c | 8 ++++++-- fs/f2fs/f2fs.h | 4 ++-- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 970304cb83c3..a423258b0188 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -811,20 +811,27 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } -bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, - int index, int nr_pages) +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct pagevec *pvec, + int index, int nr_pages, bool uptodate) { - unsigned long pgidx; - int i; + unsigned long pgidx = pvec->pages[index]->index; + int i = uptodate ? 0 : 1; + + /* + * when uptodate set to true, try to check all pages in cluster is + * uptodate or not. + */ + if (uptodate && (pgidx % cc->cluster_size)) + return false; if (nr_pages - index < cc->cluster_size) return false; - pgidx = pvec->pages[index]->index; - - for (i = 1; i < cc->cluster_size; i++) { + for (; i < cc->cluster_size; i++) { if (pvec->pages[index + i]->index != pgidx + i) return false; + if (uptodate && !PageUptodate(pvec->pages[index + i])) + return false; } return true; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8cec5ff93118..32ed8cfd4793 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3068,6 +3068,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (!f2fs_cluster_is_empty(&cc)) goto lock_page; + if (f2fs_all_cluster_page_ready(&cc, + &pvec, i, nr_pages, true)) + goto lock_page; + ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, page->index, &fsdata); @@ -3078,8 +3082,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } else if (ret2 && (!f2fs_compress_write_end(inode, fsdata, page->index, 1) || - !f2fs_all_cluster_page_loaded(&cc, - &pvec, i, nr_pages))) { + !f2fs_all_cluster_page_ready(&cc, + &pvec, i, nr_pages, false))) { retry = 1; break; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cba4ae038421..181011d9e9de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4240,8 +4240,8 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); -bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, - int index, int nr_pages); +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct pagevec *pvec, + int index, int nr_pages, bool uptodate); bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); int f2fs_write_multi_pages(struct compress_ctx *cc, From 8bddeeb721ca83c6086ff47894a88f12467f92fe Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Sun, 31 Jul 2022 11:33:46 +0800 Subject: [PATCH 470/580] f2fs: use onstack pages instead of pvec Since pvec have 15 pages, it not a multiple of 4, when write compressed pages, write in 64K as a unit, it will call pagevec_lookup_range_tag agagin, sometimes this will take a lot of time. Use onstack pages instead of pvec to mitigate this problem. 
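The gain comes from the lookup batch lining up with the cluster size. Assuming the default 4-page (16K) cluster on 4K pages, a 64K write unit is four whole clusters; a quick userspace calculation shows the mismatch a 15-entry pagevec leaves behind versus a 16-entry on-stack array:

#include <stdio.h>

int main(void)
{
	const int cluster_pages = 4;	/* default 16K cluster with 4K pages (assumed) */
	const int pvec_batch = 15;	/* PAGEVEC_SIZE */
	const int onstack_batch = 16;	/* F2FS_ONSTACK_PAGES */

	printf("pvec batch:    %d full clusters, %d leftover pages\n",
	       pvec_batch / cluster_pages, pvec_batch % cluster_pages);
	printf("onstack batch: %d full clusters, %d leftover pages\n",
	       onstack_batch / cluster_pages, onstack_batch % cluster_pages);
	return 0;
}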
Signed-off-by: Fengnan Chang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 ++++---- fs/f2fs/data.c | 16 +++++++--------- fs/f2fs/f2fs.h | 4 +++- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index a423258b0188..466b0e1db95e 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -811,10 +811,10 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } -bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct pagevec *pvec, +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, int index, int nr_pages, bool uptodate) { - unsigned long pgidx = pvec->pages[index]->index; + unsigned long pgidx = pages[index]->index; int i = uptodate ? 0 : 1; /* @@ -828,9 +828,9 @@ bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct pagevec *pvec, return false; for (; i < cc->cluster_size; i++) { - if (pvec->pages[index + i]->index != pgidx + i) + if (pages[index + i]->index != pgidx + i) return false; - if (uptodate && !PageUptodate(pvec->pages[index + i])) + if (uptodate && !PageUptodate(pages[index + i])) return false; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 32ed8cfd4793..80604b818ada 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2971,7 +2971,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, { int ret = 0; int done = 0, retry = 0; - struct pagevec pvec; + struct page *pages[F2FS_ONSTACK_PAGES]; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; @@ -3003,8 +3003,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int submitted = 0; int i; - pagevec_init(&pvec); - if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); @@ -3031,13 +3029,13 @@ static int f2fs_write_cache_pages(struct address_space *mapping, tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); + nr_pages = find_get_pages_range_tag(mapping, &index, end, + tag, F2FS_ONSTACK_PAGES, pages); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + struct page *page = pages[i]; bool need_readd; readd: need_readd = false; @@ -3069,7 +3067,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, goto lock_page; if (f2fs_all_cluster_page_ready(&cc, - &pvec, i, nr_pages, true)) + pages, i, nr_pages, true)) goto lock_page; ret2 = f2fs_prepare_compress_overwrite( @@ -3083,7 +3081,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, (!f2fs_compress_write_end(inode, fsdata, page->index, 1) || !f2fs_all_cluster_page_ready(&cc, - &pvec, i, nr_pages, false))) { + pages, i, nr_pages, false))) { retry = 1; break; } @@ -3173,7 +3171,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (need_readd) goto readd; } - pagevec_release(&pvec); + release_pages(pages, nr_pages); cond_resched(); } #ifdef CONFIG_F2FS_FS_COMPRESSION diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 181011d9e9de..1fdf53cc137c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -594,6 +594,8 @@ enum { #define RECOVERY_MAX_RA_BLOCKS BIO_MAX_PAGES #define RECOVERY_MIN_RA_BLOCKS 1 +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ union { @@ 
-4240,7 +4242,7 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); -bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct pagevec *pvec, +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, int index, int nr_pages, bool uptodate); bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); From aad438f209fa046884c711487ae9d0da510b6ee1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 11 Aug 2022 15:53:34 -0700 Subject: [PATCH 471/580] f2fs: LFS mode does not support ATGC ATGC is using SSR which violates LFS mode used by zoned device. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 445244910730..5608beb34c5c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1318,6 +1318,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } + if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with ATGC"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; From 3c75f068497841686adf0b18e36dec36625d989f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 Aug 2022 22:49:50 -0700 Subject: [PATCH 472/580] f2fs: fix wrong continue condition in GC We should decrease the frozen counter. Cc: stable@vger.kernel.org Fixes: 325163e9892b ("f2fs: add gc_urgent_high_remaining sysfs node") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a5c0c835c99b..c10e644e7db8 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -97,14 +97,10 @@ static int gc_thread_func(void *data) */ if (sbi->gc_mode == GC_URGENT_HIGH) { spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_limited) { - if (!sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_limited = false; - spin_unlock(&sbi->gc_urgent_high_lock); - sbi->gc_mode = GC_NORMAL; - continue; - } - sbi->gc_urgent_high_remaining--; + if (sbi->gc_urgent_high_limited && + !sbi->gc_urgent_high_remaining--) { + sbi->gc_urgent_high_limited = false; + sbi->gc_mode = GC_NORMAL; } spin_unlock(&sbi->gc_urgent_high_lock); } From df9a5e701ccebba715bbb4c2a5eaa99de16f0738 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Aug 2022 11:06:00 +0800 Subject: [PATCH 473/580] f2fs: remove gc_urgent_high_limited for cleanup Remove redundant sbi->gc_urgent_high_limited. 
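With the flag gone, the behaviour is a plain countdown: a non-zero gc_urgent_high_remaining bounds the number of urgent-high rounds, zero means no limit, and the mode falls back to GC_NORMAL once the count is exhausted. A minimal userspace model of that logic (gc_tick is an illustrative name; the kernel does this in gc_thread_func under gc_urgent_high_lock):

#include <stdio.h>

enum gc_mode { GC_NORMAL, GC_URGENT_HIGH };

/* illustrative model of the countdown, not kernel code */
static void gc_tick(enum gc_mode *mode, unsigned int *remaining)
{
	if (*mode != GC_URGENT_HIGH)
		return;
	if (*remaining) {		/* zero means "no trial limit set" */
		(*remaining)--;
		if (!*remaining)
			*mode = GC_NORMAL;
	}
}

int main(void)
{
	enum gc_mode mode = GC_URGENT_HIGH;
	unsigned int remaining = 3;

	for (int round = 0; round < 5; round++) {
		gc_tick(&mode, &remaining);
		printf("round %d: mode=%s remaining=%u\n", round,
		       mode == GC_URGENT_HIGH ? "GC_URGENT_HIGH" : "GC_NORMAL",
		       remaining);
	}
	return 0;
}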
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/gc.c | 8 ++++---- fs/f2fs/sysfs.c | 1 - 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1fdf53cc137c..39768ec56976 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1730,7 +1730,6 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ spinlock_t gc_urgent_high_lock; - bool gc_urgent_high_limited; /* indicates having limited trial count */ unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c10e644e7db8..faec6af53b21 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -97,10 +97,10 @@ static int gc_thread_func(void *data) */ if (sbi->gc_mode == GC_URGENT_HIGH) { spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_limited && - !sbi->gc_urgent_high_remaining--) { - sbi->gc_urgent_high_limited = false; - sbi->gc_mode = GC_NORMAL; + if (sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; } spin_unlock(&sbi->gc_urgent_high_lock); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 2b514aa6186e..18ce23dd17e4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -535,7 +535,6 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_limited = t != 0; sbi->gc_urgent_high_remaining = t; spin_unlock(&sbi->gc_urgent_high_lock); From 05677a317ed79e19e9dccfe5b21d831db178ff3c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Aug 2022 15:52:02 -0700 Subject: [PATCH 474/580] f2fs: flush pending checkpoints when freezing super This avoids -EINVAL when trying to freeze f2fs. Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 24 ++++++++++++++++++------ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 5 ++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f837072d1596..bca519378778 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1893,15 +1893,27 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; + struct task_struct *ckpt_task; - if (cprc->f2fs_issue_ckpt) { - struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + if (!cprc->f2fs_issue_ckpt) + return; - cprc->f2fs_issue_ckpt = NULL; - kthread_stop(ckpt_task); + ckpt_task = cprc->f2fs_issue_ckpt; + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); - flush_remained_ckpt_reqs(sbi, NULL); - } + f2fs_flush_ckpt_thread(sbi); +} + +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + flush_remained_ckpt_reqs(sbi, NULL); + + /* Let's wait for the previous dispatched checkpoint. 
*/ + while (atomic_read(&cprc->queued_ckpt)) + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 39768ec56976..dc66a85d9167 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3749,6 +3749,7 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) * checkpoint.c */ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5608beb34c5c..31f05d39c35d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1657,9 +1657,8 @@ static int f2fs_freeze(struct super_block *sb) if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; - /* ensure no checkpoint required */ - if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) - return -EINVAL; + /* Let's flush checkpoints and stop the thread. */ + f2fs_flush_ckpt_thread(F2FS_SB(sb)); /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); From 493de2095a678436209185cd6ff5f52879f17523 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 18 Aug 2022 22:40:09 -0700 Subject: [PATCH 475/580] f2fs: complete checkpoints during remount Otherwise, pending checkpoints can contribute a race condition to give a quota warning. - Thread - checkpoint thread add checkpoints to the list do_remount() down_write(&sb->s_umount); f2fs_remount() block_operations() down_read_trylock(&sb->s_umount) = 0 up_write(&sb->s_umount); f2fs_quota_sync() dquot_writeback_dquots() WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); Or, do_remount() down_write(&sb->s_umount); f2fs_remount() create a ckpt thread f2fs_enable_checkpoint() adds checkpoints wait for f2fs_sync_fs() trigger another pending checkpoint block_operations() down_read_trylock(&sb->s_umount) = 0 up_write(&sb->s_umount); f2fs_quota_sync() dquot_writeback_dquots() WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 31f05d39c35d..139dfa74f28a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2164,6 +2164,9 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); + + /* Let's ensure there's no pending checkpoint anymore */ + f2fs_flush_ckpt_thread(sbi); } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -2329,6 +2332,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_stop_ckpt_thread(sbi); need_restart_ckpt = true; } else { + /* Flush if the prevous checkpoint, if exists. */ + f2fs_flush_ckpt_thread(sbi); + err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, From afd5e82a581994c3685716e6dd5a94d3a8d64152 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 23 Aug 2022 10:18:42 -0700 Subject: [PATCH 476/580] f2fs: increase the limit for reserve_root This patch increases the threshold that limits the reserved root space from 0.2% to 12.5% by using simple shift operation. Typically Android sets 128MB, but if the storage capacity is 32GB, 0.2% which is around 64MB becomes too small. Let's relax it. 
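A back-of-the-envelope check of the two caps, assuming 4KiB blocks and the 32GB device cited above (the min() against user_block_count - reserved_blocks is ignored here):

#include <stdio.h>

int main(void)
{
	/* 32GiB of user blocks at 4KiB per block (assumed) */
	unsigned long long user_block_count = (32ULL << 30) >> 12;

	unsigned long long old_limit = (user_block_count << 1) / 1000; /* 0.2%  */
	unsigned long long new_limit = user_block_count >> 3;          /* 12.5% */

	printf("old cap: %llu blocks (~%llu MiB)\n", old_limit, (old_limit << 12) >> 20);
	printf("new cap: %llu blocks (~%llu MiB)\n", new_limit, (new_limit << 12) >> 20);
	printf("128MiB reserve_root: %llu blocks\n", (128ULL << 20) >> 12);
	return 0;
}

Under the old cap a typical 128MB Android reservation (32768 blocks) would be clamped to roughly 65MB on a 32GB device; under the new cap it fits with room to spare.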
Cc: stable@vger.kernel.org Reported-by: Aran Dalton Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 139dfa74f28a..4134e5868fd8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -304,10 +304,10 @@ static void f2fs_destroy_casefold_cache(void) { } static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count << 1) / 1000, + block_t limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); - /* limit is 0.2% */ + /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && F2FS_OPTION(sbi).root_reserved_blocks > limit) { F2FS_OPTION(sbi).root_reserved_blocks = limit; From b09aa0b6c5fa23d55d36b799c01b21a44bbc4179 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Mon, 29 Aug 2022 21:31:20 +0800 Subject: [PATCH 477/580] f2fs: replace logical value "true" with a int number The "true" is not match the parametera type "int", and we modify it. Signed-off-by: Zhang Qilong Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8ed985685d2e..2f29961df1fe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -480,7 +480,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) mutex_unlock(&sbi->flush_lock); } - f2fs_sync_fs(sbi->sb, true); + f2fs_sync_fs(sbi->sb, 1); stat_inc_bg_cp_count(sbi->stat_info); } From 4e4215979f62ba1b9df302d7057fe552c5b38ad0 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 23 Aug 2022 19:20:22 +0800 Subject: [PATCH 478/580] f2fs: simplify code in f2fs_prepare_decomp_mem It could return directly after init_decompress_ctx. Signed-off-by: Zhang Qilong Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 466b0e1db95e..ab33d1346941 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1538,12 +1538,8 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, if (!dic->cbuf) return -ENOMEM; - if (cops->init_decompress_ctx) { - int ret = cops->init_decompress_ctx(dic); - - if (ret) - return ret; - } + if (cops->init_decompress_ctx) + return cops->init_decompress_ctx(dic); return 0; } From 0f99b775bb55d1edde78a5ce9e7b8b62b94b168b Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 30 Aug 2022 20:13:23 +0800 Subject: [PATCH 479/580] f2fs: return the tmp_ptr directly in __bitmap_ptr Just return tmp_ptr here, it's no need to dereference checkpoint pointer again. Signed-off-by: Zhang Qilong Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dc66a85d9167..b8ae2b148ad4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2534,7 +2534,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) - return &ckpt->sit_nat_version_bitmap; + return tmp_ptr; else return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { From b50bd0ada36545a7505b95c323265fa59fa6c856 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 30 Aug 2022 14:55:15 +0800 Subject: [PATCH 480/580] f2fs: use COMPRESS_MAPPING to get compress cache mapping Just use the defined COMPRESS_MAPPING to get compress cache mapping instaed of direct accessing name. 
Signed-off-by: Zhang Qilong Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index ab33d1346941..361988d3b4d2 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1871,7 +1871,7 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { - struct address_space *mapping = sbi->compress_inode->i_mapping; + struct address_space *mapping = COMPRESS_MAPPING(sbi); struct pagevec pvec; pgoff_t index = 0; pgoff_t end = MAX_BLKADDR(sbi); From 4bcfb48b9a6f30dbbf260b430d50fcdb638821fd Mon Sep 17 00:00:00 2001 From: Shuqi Zhang Date: Wed, 31 Aug 2022 10:24:40 +0800 Subject: [PATCH 481/580] f2fs: fix wrong dirty page count when race between mmap and fallocate. This is a BUG_ON issue as follows when running xfstest-generic-503: WARNING: CPU: 21 PID: 1385 at fs/f2fs/inode.c:762 f2fs_evict_inode+0x847/0xaa0 Modules linked in: CPU: 21 PID: 1385 Comm: umount Not tainted 5.19.0-rc5+ #73 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-4.fc34 04/01/2014 Call Trace: evict+0x129/0x2d0 dispose_list+0x4f/0xb0 evict_inodes+0x204/0x230 generic_shutdown_super+0x5b/0x1e0 kill_block_super+0x29/0x80 kill_f2fs_super+0xe6/0x140 deactivate_locked_super+0x44/0xc0 deactivate_super+0x79/0x90 cleanup_mnt+0x114/0x1a0 __cleanup_mnt+0x16/0x20 task_work_run+0x98/0x100 exit_to_user_mode_prepare+0x3d0/0x3e0 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x42/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Function flow analysis when BUG occurs: f2fs_fallocate mmap do_page_fault pte_spinlock // ---lock_pte do_wp_page wp_page_shared pte_unmap_unlock // unlock_pte do_page_mkwrite f2fs_vm_page_mkwrite down_read(invalidate_lock) lock_page if (PageMappedToDisk(page)) goto out; // set_page_dirty --NOT RUN out: up_read(invalidate_lock); finish_mkwrite_fault // unlock_pte f2fs_collapse_range down_write(i_mmap_sem) truncate_pagecache unmap_mapping_pages i_mmap_lock_write // down_write(i_mmap_rwsem) ...... zap_pte_range pte_offset_map_lock // ---lock_pte set_page_dirty f2fs_dirty_data_folio if (!folio_test_dirty(folio)) { fault_dirty_shared_page set_page_dirty f2fs_dirty_data_folio if (!folio_test_dirty(folio)) { filemap_dirty_folio f2fs_update_dirty_folio // ++ } unlock_page filemap_dirty_folio f2fs_update_dirty_folio // page count++ } pte_unmap_unlock // --unlock_pte i_mmap_unlock_write // up_write(i_mmap_rwsem) truncate_inode_pages up_write(i_mmap_sem) When race happens between mmap-do_page_fault-wp_page_shared and fallocate-truncate_pagecache-zap_pte_range, the zap_pte_range calls function set_page_dirty without page lock. Besides, though truncate_pagecache has immap and pte lock, wp_page_shared calls fault_dirty_shared_page without any. In this case, two threads race in f2fs_dirty_data_folio function. Page is set to dirty only ONCE, but the count is added TWICE by calling filemap_dirty_folio. Thus the count of dirty page cannot accord with the real dirty pages. Following is the solution to in case of race happens without any lock. Since folio_test_set_dirty in filemap_dirty_folio is atomic, judge return value will not be at risk of race. 
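The key property is that the accounting now follows the return value of an atomic test-and-set rather than a separate PageDirty() check, so exactly one of the racing callers bumps the counter. A minimal userspace model of that pattern (page_dirty and dirty_pages are illustrative stand-ins):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int page_dirty;	/* 0 = clean, 1 = dirty */
static atomic_int dirty_pages;	/* accounting counter   */

/* returns 1 only for the caller that actually made the 0 -> 1 transition */
static int set_page_dirty_model(void)
{
	if (atomic_exchange(&page_dirty, 1) == 0) {
		atomic_fetch_add(&dirty_pages, 1);
		return 1;
	}
	return 0;
}

static void *dirtier(void *arg)
{
	(void)arg;
	set_page_dirty_model();
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, dirtier, NULL);
	pthread_create(&b, NULL, dirtier, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	/* always 1: the counter follows the atomic transition, not a racy check */
	printf("dirty_pages = %d\n", atomic_load(&dirty_pages));
	return 0;
}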
Signed-off-by: Shuqi Zhang Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/data.c | 3 +-- fs/f2fs/node.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index bca519378778..0d6846e60cae 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -447,8 +447,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + if (__set_page_dirty_nobuffers(page)) { inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); set_page_private_reference(page); return 1; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 80604b818ada..c2f06c3090ca 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3897,8 +3897,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (PageSwapCache(page)) return __set_page_dirty_nobuffers(page); - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + if (__set_page_dirty_nobuffers(page)) { f2fs_update_dirty_page(inode, page); return 1; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 69b011a461a8..7da4fa217e27 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2146,8 +2146,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (IS_INODE(page)) f2fs_inode_chksum_set(F2FS_P_SB(page), page); #endif - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + if (__set_page_dirty_nobuffers(page)) { inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); set_page_private_reference(page); return 1; From a5dee7437c23eeb87e609f58aad0348d61c77bb2 Mon Sep 17 00:00:00 2001 From: Yonggil Song Date: Fri, 2 Sep 2022 11:07:49 +0900 Subject: [PATCH 482/580] f2fs: fix typo Fix typo in f2fs.h Detected by Jaeyoon Choi Signed-off-by: Yonggil Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b8ae2b148ad4..86c7b0e6ff7e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -270,7 +270,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ - TRANS_DIR_INO, /* for trasactions dir ino list */ + TRANS_DIR_INO, /* for transactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; From 6c976810df48b0a7b8fee0adb8671a9b2020d5d5 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Wed, 31 Aug 2022 17:48:15 +0800 Subject: [PATCH 483/580] f2fs: add static init_idisk_time function to reduce the code We can use a inner function to init the disk time of f2fs_inode_info for cleaning redundant code. 
Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 615c51a2ff2b..80bc0e35320b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -333,6 +333,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return true; } +static void init_idisk_time(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -465,10 +475,7 @@ static int do_read_inode(struct inode *inode) } } - fi->i_disk_time[0] = inode->i_atime; - fi->i_disk_time[1] = inode->i_ctime; - fi->i_disk_time[2] = inode->i_mtime; - fi->i_disk_time[3] = fi->i_crtime; + init_idisk_time(inode); f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -676,11 +683,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_page_private_inline(node_page); - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; - + init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); #endif From c5151e43325c80cced37976b01bd97018397e61a Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Thu, 1 Sep 2022 15:19:37 +0800 Subject: [PATCH 484/580] f2fs: remove redundant check in f2fs_sanity_check_cluster It have checked "compressed" at the entry of f2fs_sanity_check_cluster, just remove the redundant check for better performance here. Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 361988d3b4d2..c0d44fbcba56 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -882,17 +882,15 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) reason = "[C|*|C|*]"; goto out; } - if (compressed) { - if (!__is_valid_data_blkaddr(blkaddr)) { - if (!cluster_end) - cluster_end = i; - continue; - } - /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ - if (cluster_end) { - reason = "[C|N|N|V]"; - goto out; - } + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; } } return false; From 50d9025b845f1b92288583912d873572aec57fdb Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Mon, 5 Sep 2022 12:59:17 +0800 Subject: [PATCH 485/580] f2fs: fix race condition on setting FI_NO_EXTENT flag The following scenarios exist. process A: process B: ->f2fs_drop_extent_tree ->f2fs_update_extent_cache_range ->f2fs_update_extent_tree_range ->write_lock ->set_inode_flag ->is_inode_flag_set ->__free_extent_tree // Shouldn't // have been // cleaned up // here ->write_lock In this case, the "FI_NO_EXTENT" flag is set between f2fs_update_extent_tree_range and is_inode_flag_set by other process. it leads to clearing the whole exten tree which should not have happened. And we fix it by move the setting it to the range of write_lock. 
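The fix is the usual check-then-act rule: the flag must be set under the same lock its readers hold when they act on it, so an updater either completes before the drop or observes FI_NO_EXTENT and skips. A minimal userspace model of the corrected ordering (the names are illustrative, not kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* illustrative model of an extent tree guarded by a rwlock */
static pthread_rwlock_t et_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool no_extent;		/* models FI_NO_EXTENT       */
static int  cached_extents;	/* models the cached extents */

static void drop_extent_tree(void)
{
	pthread_rwlock_wrlock(&et_lock);
	no_extent = true;	/* flag set and free are now one critical section */
	cached_extents = 0;
	pthread_rwlock_unlock(&et_lock);
}

static void update_extent_tree_range(void)
{
	pthread_rwlock_wrlock(&et_lock);
	if (!no_extent)		/* cannot observe a half-done drop */
		cached_extents++;
	pthread_rwlock_unlock(&et_lock);
}

int main(void)
{
	update_extent_tree_range();
	drop_extent_tree();
	update_extent_tree_range();	/* skipped: tree was dropped */
	printf("cached_extents = %d, no_extent = %d\n", cached_extents, no_extent);
	return 0;
}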
Fixes:5f281fab9b9a3 ("f2fs: disable extent_cache for fcollapse/finsert inodes") Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 866e72b29bd5..761fd42c93f2 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -804,9 +804,8 @@ void f2fs_drop_extent_tree(struct inode *inode) if (!f2fs_may_extent_tree(inode)) return; - set_inode_flag(inode, FI_NO_EXTENT); - write_lock(&et->lock); + set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); if (et->largest.len) { et->largest.len = 0; From a282922a558ac0420030fe7a195bbcf3a0d44b12 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Wed, 7 Sep 2022 10:38:48 +0800 Subject: [PATCH 486/580] f2fs: let FI_OPU_WRITE override FADVISE_COLD_BIT Cold files may be fragmented due to SSR, defragment is needed as sequential reads are dominant scenarios of these files. FI_OPU_WRITE should override FADVISE_COLD_BIT to avoid defragment fails. Signed-off-by: Weichao Guo Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c2f06c3090ca..575d203047d5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2597,7 +2597,7 @@ bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) return true; /* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) + if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, fio); From f24a4cd65f1fd0755fa3b3d8b8bd5a82868793fa Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Sep 2022 10:08:41 +0800 Subject: [PATCH 487/580] f2fs: fix to do sanity check on destination blkaddr during recovery As Wenqing Liu reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216456 loop5: detected capacity change from 0 to 131072 F2FS-fs (loop5): recover_inode: ino = 6, name = hln, inline = 1 F2FS-fs (loop5): recover_data: ino = 6 (i_size: recover) err = 0 F2FS-fs (loop5): recover_inode: ino = 6, name = hln, inline = 1 F2FS-fs (loop5): recover_data: ino = 6 (i_size: recover) err = 0 F2FS-fs (loop5): recover_inode: ino = 6, name = hln, inline = 1 F2FS-fs (loop5): recover_data: ino = 6 (i_size: recover) err = 0 F2FS-fs (loop5): Bitmap was wrongly set, blk:5634 ------------[ cut here ]------------ WARNING: CPU: 3 PID: 1013 at fs/f2fs/segment.c:2198 RIP: 0010:update_sit_entry+0xa55/0x10b0 [f2fs] Call Trace: f2fs_do_replace_block+0xa98/0x1890 [f2fs] f2fs_replace_block+0xeb/0x180 [f2fs] recover_data+0x1a69/0x6ae0 [f2fs] f2fs_recover_fsync_data+0x120d/0x1fc0 [f2fs] f2fs_fill_super+0x4665/0x61e0 [f2fs] mount_bdev+0x2cf/0x3b0 legacy_get_tree+0xed/0x1d0 vfs_get_tree+0x81/0x2b0 path_mount+0x47e/0x19d0 do_mount+0xce/0xf0 __x64_sys_mount+0x12c/0x1a0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd If we enable CONFIG_F2FS_CHECK_FS config, it will trigger a kernel panic instead of warning. The root cause is: in fuzzed image, SIT table is inconsistent with inode mapping table, result in triggering such warning during SIT table update. This patch introduces a new flag DATA_GENERIC_ENHANCE_UPDATE, w/ this flag, data block recovery flow can check destination blkaddr's validation in SIT table, and skip f2fs_replace_block() to avoid inconsistent status. 
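The new check boils down to consulting the segment's current validity bitmap for the destination block before replaying: if the block is already marked in use, the node and SIT views disagree and recovery bails out with -EFSCORRUPTED instead of calling f2fs_replace_block(). A simplified userspace sketch of such a bitmap lookup (LSB-first bit order here for brevity; the kernel uses f2fs_test_bit on se->cur_valid_map):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool blk_marked_valid(const unsigned char *valid_map, unsigned int offset)
{
	return valid_map[offset >> 3] & (1U << (offset & 7));
}

int main(void)
{
	unsigned char valid_map[64];	/* one 512-block segment, 1 bit per block */

	memset(valid_map, 0, sizeof(valid_map));
	valid_map[0] = 0x02;		/* block 1 already in use */

	printf("blk 0 valid: %d\n", blk_marked_valid(valid_map, 0)); /* 0 -> ok to replay */
	printf("blk 1 valid: %d\n", blk_marked_valid(valid_map, 1)); /* 1 -> inconsistent, skip */
	return 0;
}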
Cc: Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 +++++++++- fs/f2fs/f2fs.h | 4 ++++ fs/f2fs/recovery.c | 8 ++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0d6846e60cae..c6a0295f3f10 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -140,7 +140,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int segno, offset; bool exist; - if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ) + if (type == DATA_GENERIC) return true; segno = GET_SEGNO(sbi, blkaddr); @@ -148,6 +148,13 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); + if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return exist; + } + if (!exist && type == DATA_GENERIC_ENHANCE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); @@ -185,6 +192,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case DATA_GENERIC: case DATA_GENERIC_ENHANCE: case DATA_GENERIC_ENHANCE_READ: + case DATA_GENERIC_ENHANCE_UPDATE: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) { f2fs_warn(sbi, "access invalid blkaddr:%u", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 86c7b0e6ff7e..30496be11018 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -262,6 +262,10 @@ enum { * condition of read on truncated area * by extent_cache */ + DATA_GENERIC_ENHANCE_UPDATE, /* + * strong check on range and segment + * bitmap for update case + */ META_GENERIC, }; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index fc85d8f8ce65..dcd6a0d7e9ad 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -698,6 +698,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto err; } + if (f2fs_is_valid_blkaddr(sbi, dest, + DATA_GENERIC_ENHANCE_UPDATE)) { + f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", + dest, inode->i_ino, dn.ofs_in_node); + err = -EFSCORRUPTED; + goto err; + } + /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, ni.version, false, false); From 36f6ef697427b741748e3b20d7e1618892c5f7b4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Sep 2022 19:51:51 +0800 Subject: [PATCH 488/580] f2fs: fix to do sanity check on summary info As Wenqing Liu reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216456 BUG: KASAN: use-after-free in recover_data+0x63ae/0x6ae0 [f2fs] Read of size 4 at addr ffff8881464dcd80 by task mount/1013 CPU: 3 PID: 1013 Comm: mount Tainted: G W 6.0.0-rc4 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x5e print_report.cold+0xf3/0x68d kasan_report+0xa8/0x130 recover_data+0x63ae/0x6ae0 [f2fs] f2fs_recover_fsync_data+0x120d/0x1fc0 [f2fs] f2fs_fill_super+0x4665/0x61e0 [f2fs] mount_bdev+0x2cf/0x3b0 legacy_get_tree+0xed/0x1d0 vfs_get_tree+0x81/0x2b0 path_mount+0x47e/0x19d0 do_mount+0xce/0xf0 __x64_sys_mount+0x12c/0x1a0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd The root cause is: in fuzzed image, SSA table is corrupted: ofs_in_node is larger than ADDRS_PER_PAGE(), result in out-of-range access on 4k-size page. 
- recover_data - do_recover_data - check_index_in_prev_nodes - f2fs_data_blkaddr This patch adds sanity check on summary info in recovery and GC flow in where the flows rely on them. After patch: [ 29.310883] F2FS-fs (loop0): Inconsistent ofs_in_node:65286 in summary, ino:0, nid:6, max:1018 Cc: Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 10 +++++++++- fs/f2fs/recovery.c | 15 ++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index faec6af53b21..90e1b4af72ee 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1078,7 +1078,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node; + unsigned int ofs_in_node, max_addrs; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1104,6 +1104,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } + max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE : + DEF_ADDRS_PER_BLOCK; + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", + ofs_in_node, dni->ino, dni->nid, max_addrs); + return false; + } + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index dcd6a0d7e9ad..fb35375c3e21 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -473,7 +473,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct dnode_of_data tdn = *dn; nid_t ino, nid; struct inode *inode; - unsigned int offset; + unsigned int offset, ofs_in_node, max_addrs; block_t bidx; int i; @@ -500,15 +500,24 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); + ofs_in_node = le16_to_cpu(sum.ofs_in_node); + + max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode); + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", + ofs_in_node, dn->inode->i_ino, nid, max_addrs); + return -EFSCORRUPTED; + } + if (dn->inode->i_ino == nid) { tdn.nid = nid; if (!dn->inode_page_locked) lock_page(dn->inode_page); tdn.node_page = dn->inode_page; - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } else if (dn->nid == nid) { - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } From 90dfa8540fda1e539a599c97c7dc11e3033b66b1 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Mon, 19 Sep 2022 19:57:09 +0800 Subject: [PATCH 489/580] f2fs: add "c_len" into trace_f2fs_update_extent_tree_range for compressed file The trace_f2fs_update_extent_tree_range could not record compressed block length in the cluster of compress file and we just add it. 
Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 4 ++-- include/trace/events/f2fs.h | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 761fd42c93f2..746abfda3b66 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -544,7 +544,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); write_lock(&et->lock); @@ -675,7 +675,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen); + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index cc4a803e7c0a..da8147c649cd 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1589,9 +1589,10 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, TRACE_EVENT(f2fs_update_extent_tree_range, TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr, - unsigned int len), + unsigned int len, + unsigned int c_len), - TP_ARGS(inode, pgofs, blkaddr, len), + TP_ARGS(inode, pgofs, blkaddr, len, c_len), TP_STRUCT__entry( __field(dev_t, dev) @@ -1599,6 +1600,7 @@ TRACE_EVENT(f2fs_update_extent_tree_range, __field(unsigned int, pgofs) __field(u32, blk) __field(unsigned int, len) + __field(unsigned int, c_len) ), TP_fast_assign( @@ -1607,14 +1609,17 @@ TRACE_EVENT(f2fs_update_extent_tree_range, __entry->pgofs = pgofs; __entry->blk = blkaddr; __entry->len = len; + __entry->c_len = c_len; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "blkaddr = %u, len = %u", + "blkaddr = %u, len = %u, " + "c_len = %u", show_dev_ino(__entry), __entry->pgofs, __entry->blk, - __entry->len) + __entry->len, + __entry->c_len) ); TRACE_EVENT(f2fs_shrink_extent_tree, From ec7347bf2b32d6a5fd3b3eb7594d5084909a3f15 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Wed, 14 Sep 2022 09:33:22 +0800 Subject: [PATCH 490/580] f2fs: code clean and fix a type error ERROR: code indent should use tabs where possible ERROR: spaces required around that ':' ERROR: incorrect tab Found serveral code type errors when review the code and fix it. There is no function change. Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/debug.c | 2 +- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/node.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 575d203047d5..980945cc6441 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -701,7 +701,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_io(fio->io_wbc, page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(page): WB_DATA_TYPE(fio->page)); + __read_io_type(page) : WB_DATA_TYPE(fio->page)); __submit_bio(fio->sbi, bio, fio->type); return 0; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c01471573977..29cf5b6b2341 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -347,7 +347,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\n=====[ partition info(%pg). 
#%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_readonly(si->sbi->sb) ? "RO" : "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); if (si->sbi->s_flag) { diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 746abfda3b66..932c070173b9 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -583,7 +583,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, org_end = dei.fofs + dei.len; f2fs_bug_on(sbi, pos >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { en->ei.len = pos - en->ei.fofs; prev_en = en; parts = 1; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7da4fa217e27..328801c73fd3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -585,7 +585,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - up_read(&curseg->journal_rwsem); + up_read(&curseg->journal_rwsem); if (i >= 0) { f2fs_up_read(&nm_i->nat_tree_lock); goto cache; From 3c1a2f5a75aea0f3d236f76b04d1d9f3073b51e4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Sep 2022 21:28:46 +0800 Subject: [PATCH 491/580] f2fs: fix to account FS_CP_DATA_IO correctly f2fs_inode_info.cp_task was introduced for FS_CP_DATA_IO accounting since commit b0af6d491a6b ("f2fs: add app/fs io stat"). However, cp_task usage coverage has been increased due to below commits: commit 040d2bb318d1 ("f2fs: fix to avoid deadloop if data_flush is on") commit 186857c5a14a ("f2fs: fix potential recursive call when enabling data_flush") So that, if data_flush mountoption is on, when data flush was triggered from background, the IO from data flush will be accounted as checkpoint IO type incorrectly. In order to fix this issue, this patch splits cp_task into two: a) cp_task: used for IO accounting b) wb_task: used to avoid deadlock Fixes: commit 040d2bb318d1 ("f2fs: fix to avoid deadloop if data_flush is on") Fixes: commit 186857c5a14a ("f2fs: fix potential recursive call when enabling data_flush") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 13 +++++++++---- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/segment.c | 2 +- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c6a0295f3f10..35a2ba18f28d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1061,7 +1061,8 @@ void f2fs_remove_dirty_inode(struct inode *inode) spin_unlock(&sbi->inode_lock[type]); } -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp) { struct list_head *head; struct inode *inode; @@ -1096,11 +1097,15 @@ int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) if (inode) { unsigned long cur_ino = inode->i_ino; - F2FS_I(inode)->cp_task = current; + if (from_cp) + F2FS_I(inode)->cp_task = current; + F2FS_I(inode)->wb_task = current; filemap_fdatawrite(inode->i_mapping); - F2FS_I(inode)->cp_task = NULL; + F2FS_I(inode)->wb_task = NULL; + if (from_cp) + F2FS_I(inode)->cp_task = NULL; iput(inode); /* We need to give cpu to another writers. 
*/ @@ -1229,7 +1234,7 @@ static int block_operations(struct f2fs_sb_info *sbi) /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true); if (err) return err; cond_resched(); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 980945cc6441..cba9eb643998 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2910,7 +2910,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task && allow_balance) + !F2FS_I(inode)->wb_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -3212,7 +3212,7 @@ static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { /* to avoid deadlock in path of data flush */ - if (F2FS_I(inode)->cp_task) + if (F2FS_I(inode)->wb_task) return false; if (!S_ISREG(inode->i_mode)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 30496be11018..9bd938169db2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -782,6 +782,7 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ struct task_struct *cp_task; /* separate cp/wb IO stats*/ + struct task_struct *wb_task; /* indicate inode is in context of writeback */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ spinlock_t i_size_lock; /* protect last_disk_size */ @@ -3783,7 +3784,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_page(struct inode *inode, struct page *page); void f2fs_remove_dirty_inode(struct inode *inode); -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp); void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2f29961df1fe..c3847e84f782 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -475,7 +475,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) mutex_lock(&sbi->flush_lock); blk_start_plug(&plug); - f2fs_sync_dirty_inodes(sbi, FILE_INODE); + f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); blk_finish_plug(&plug); mutex_unlock(&sbi->flush_lock); From 191733f538b631db07fcc917bce01bb9a3605f00 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Sep 2022 15:48:12 +0800 Subject: [PATCH 492/580] f2fs: fix to detect corrupted meta ino It is possible that ino of dirent or orphan inode is corrupted in a fuzzed image, occasionally, if corrupted ino is equal to meta ino: meta_ino, node_ino or compress_ino, caller of f2fs_iget() from below call paths will get meta inode directly, it's not allowed, let's add sanity check to detect such cases. 
case #1 - recover_dentry - __f2fs_find_entry - f2fs_iget_retry case #2 - recover_orphan_inode - f2fs_iget_retry Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 80bc0e35320b..dc7cd1bbb128 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -487,6 +487,12 @@ static int do_read_inode(struct inode *inode) return 0; } +static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino) +{ + return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) || + ino == F2FS_COMPRESS_INO(sbi); +} + struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -498,16 +504,21 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { + if (is_meta_ino(sbi, ino)) { + f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + ret = -EFSCORRUPTED; + trace_f2fs_iget_exit(inode, ret); + iput(inode); + return ERR_PTR(ret); + } + trace_f2fs_iget(inode); return inode; } - if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) - goto make_now; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (ino == F2FS_COMPRESS_INO(sbi)) + if (is_meta_ino(sbi, ino)) goto make_now; -#endif ret = do_read_inode(inode); if (ret) From 46ec37541fc82424adae5fd8234352dc20772f43 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 27 Sep 2022 10:44:47 +0800 Subject: [PATCH 493/580] f2fs: introduce cp_status sysfs entry This patch adds a new sysfs entry named cp_status, it can output checkpoint flags in real time. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 24 ++++++++++++++++++++++++ fs/f2fs/sysfs.c | 8 ++++++++ 2 files changed, 32 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 8c7428f9d4dc..e7b5c1fd661e 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -453,6 +453,30 @@ Description: Show status of f2fs superblock in real time. 0x4000 SBI_IS_FREEZING freefs is in process ====== ===================== ================================= +What: /sys/fs/f2fs//stat/cp_status +Date: September 2022 +Contact: "Chao Yu" +Description: Show status of f2fs checkpoint in real time. 
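The node prints the raw ckpt_flags word in hex, so it can be decoded from user space against the table above. A small illustrative reader (not part of the patch; the sysfs path is passed on the command line):

#include <stdio.h>
#include <stdlib.h>

static const struct { unsigned int bit; const char *name; } cp_flags[] = {
	{ 0x00000001, "CP_UMOUNT_FLAG" },
	{ 0x00000002, "CP_ORPHAN_PRESENT_FLAG" },
	{ 0x00000004, "CP_COMPACT_SUM_FLAG" },
	{ 0x00000008, "CP_ERROR_FLAG" },
	{ 0x00000010, "CP_FSCK_FLAG" },
	{ 0x00000020, "CP_FASTBOOT_FLAG" },
	{ 0x00000040, "CP_CRC_RECOVERY_FLAG" },
	{ 0x00000080, "CP_NAT_BITS_FLAG" },
	{ 0x00000100, "CP_TRIMMED_FLAG" },
	{ 0x00000200, "CP_NOCRC_RECOVERY_FLAG" },
	{ 0x00000400, "CP_LARGE_NAT_BITMAP_FLAG" },
	{ 0x00000800, "CP_QUOTA_NEED_FSCK_FLAG" },
	{ 0x00001000, "CP_DISABLED_FLAG" },
	{ 0x00002000, "CP_DISABLED_QUICK_FLAG" },
	{ 0x00004000, "CP_RESIZEFS_FLAG" },
};

int main(int argc, char **argv)
{
	unsigned int val;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s /sys/fs/f2fs/<dev>/stat/cp_status\n", argv[0]);
		return 1;
	}
	f = fopen(argv[1], "r");
	if (!f || fscanf(f, "%x", &val) != 1) {
		perror("cp_status");
		return 1;
	}
	fclose(f);

	printf("cp_status = 0x%x\n", val);
	for (size_t i = 0; i < sizeof(cp_flags) / sizeof(cp_flags[0]); i++)
		if (val & cp_flags[i].bit)
			printf("  %s\n", cp_flags[i].name);
	return 0;
}

For example, a reading of 0x181 would decode to CP_UMOUNT_FLAG, CP_NAT_BITS_FLAG and CP_TRIMMED_FLAG.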
+ + =============================== ============================== + cp flag value + CP_UMOUNT_FLAG 0x00000001 + CP_ORPHAN_PRESENT_FLAG 0x00000002 + CP_COMPACT_SUM_FLAG 0x00000004 + CP_ERROR_FLAG 0x00000008 + CP_FSCK_FLAG 0x00000010 + CP_FASTBOOT_FLAG 0x00000020 + CP_CRC_RECOVERY_FLAG 0x00000040 + CP_NAT_BITS_FLAG 0x00000080 + CP_TRIMMED_FLAG 0x00000100 + CP_NOCRC_RECOVERY_FLAG 0x00000200 + CP_LARGE_NAT_BITMAP_FLAG 0x00000400 + CP_QUOTA_NEED_FSCK_FLAG 0x00000800 + CP_DISABLED_FLAG 0x00001000 + CP_DISABLED_QUICK_FLAG 0x00002000 + CP_RESIZEFS_FLAG 0x00004000 + =============================== ============================== + What: /sys/fs/f2fs//ckpt_thread_ioprio Date: January 2021 Contact: "Daeho Jeong" diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 18ce23dd17e4..e5191589aa29 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -128,6 +128,12 @@ static ssize_t sb_status_show(struct f2fs_attr *a, return sprintf(buf, "%lx\n", sbi->s_flag); } +static ssize_t cp_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); +} + static ssize_t pending_discard_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -1020,8 +1026,10 @@ static struct attribute *f2fs_feat_attrs[] = { }; F2FS_GENERAL_RO_ATTR(sb_status); +F2FS_GENERAL_RO_ATTR(cp_status); static struct attribute *f2fs_stat_attrs[] = { ATTR_LIST(sb_status), + ATTR_LIST(cp_status), NULL, }; From f31fda15cf1ba4dafdf83b6a09453678cb270bab Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Fri, 23 Sep 2022 15:17:55 +0800 Subject: [PATCH 494/580] f2fs: remove the unnecessary check in f2fs_xattr_fiemap Whehter or not error occurs, checking "err == 1" is unnecessary in f2fs_xattr_fiemap(), and just remove it here. Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cba9eb643998..a4d809718e62 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1829,7 +1829,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); - if (err || err == 1) + if (err) return err; } From 744d418bae54d8e6e94be07d22554ffa5f0298cd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Sep 2022 23:38:53 +0800 Subject: [PATCH 495/580] f2fs: support recording stop_checkpoint reason into super_block This patch supports to record stop_checkpoint error into f2fs_super_block.s_stop_reason[]. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 +++++++--- fs/f2fs/data.c | 6 ++++-- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/file.c | 12 +++++++----- fs/f2fs/gc.c | 6 ++++-- fs/f2fs/inode.c | 3 ++- fs/f2fs/segment.c | 5 +++-- fs/f2fs/super.c | 20 ++++++++++++++++++++ include/linux/f2fs_fs.h | 17 ++++++++++++++++- 9 files changed, 66 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 35a2ba18f28d..32828de6f7e2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,12 +26,16 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); - if (!end_io) + if (!end_io) { f2fs_flush_merged_writes(sbi); + + f2fs_handle_stop(sbi, reason); + } } /* @@ -122,7 +126,7 @@ struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index) if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } return page; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a4d809718e62..f6c0465e1a51 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -331,7 +331,8 @@ static void f2fs_write_end_io(struct bio *bio) mempool_free(page, sbi->write_io_dummy); if (unlikely(bio->bi_status)) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); continue; } @@ -347,7 +348,8 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9bd938169db2..429ec0ff1e52 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3596,6 +3596,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); @@ -3753,7 +3754,8 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason); void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c299b98f8d93..f1bb4ad8951f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2222,7 +2222,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) { if (ret == -EROFS) { ret = 0; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); trace_f2fs_shutdown(sbi, in, ret); } @@ -2238,7 +2239,8 @@ static int f2fs_ioc_shutdown(struct file *filp, 
unsigned long arg) goto out; } if (sb) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev, sb); } @@ -2248,16 +2250,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = f2fs_sync_fs(sb, 1); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 90e1b4af72ee..47746a343444 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -74,7 +74,8 @@ static int gc_thread_func(void *data) if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FAULT_INJECT); } if (!sb_start_write_trylock(sbi->sb)) { @@ -1711,7 +1712,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); goto skip; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index dc7cd1bbb128..873e17a2b8e8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -713,7 +713,8 @@ void f2fs_update_inode_page(struct inode *inode) cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_UPDATE_INODE); } return; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c3847e84f782..a69227879442 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -375,7 +375,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); } /* balance_fs_bg is able to be pending */ @@ -693,7 +693,8 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) } while (ret && --count); if (ret) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FLUSH_FAIL); break; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4134e5868fd8..caa5b438491b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3807,6 +3807,26 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int err; + + f2fs_bug_on(sbi, reason >= MAX_STOP_REASON); + + f2fs_down_write(&sbi->sb_lock); + + if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + raw_super->s_stop_reason[reason]++; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", + reason, err); + + f2fs_up_write(&sbi->sb_lock); +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block 
*raw_super = F2FS_RAW_SUPER(sbi); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d445150c5350..5dd1e52b8997 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -73,6 +73,20 @@ struct f2fs_device { __le32 total_segments; } __packed; +/* reason of stop_checkpoint */ +enum stop_cp_reason { + STOP_CP_REASON_SHUTDOWN, + STOP_CP_REASON_FAULT_INJECT, + STOP_CP_REASON_META_PAGE, + STOP_CP_REASON_WRITE_FAIL, + STOP_CP_REASON_CORRUPTED_SUMMARY, + STOP_CP_REASON_UPDATE_INODE, + STOP_CP_REASON_FLUSH_FAIL, + STOP_CP_REASON_MAX, +}; + +#define MAX_STOP_REASON 32 + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -116,7 +130,8 @@ struct f2fs_super_block { __u8 hot_ext_count; /* # of hot file extension */ __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ - __u8 reserved[306]; /* valid reserved region */ + __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */ + __u8 reserved[274]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ } __packed; From b1983d09aaa682af713aeb488fb26473d914b93f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Sep 2022 23:38:54 +0800 Subject: [PATCH 496/580] f2fs: support recording errors into superblock This patch supports to record detail reason of FSCORRUPTED error into f2fs_super_block.s_errors[]. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 ++ fs/f2fs/data.c | 24 +++++++++++++++++--- fs/f2fs/dir.c | 1 + fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/file.c | 12 ++++++++-- fs/f2fs/gc.c | 2 ++ fs/f2fs/inline.c | 2 ++ fs/f2fs/inode.c | 6 ++++- fs/f2fs/node.c | 2 ++ fs/f2fs/recovery.c | 6 +++++ fs/f2fs/segment.c | 11 +++++++++ fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 49 +++++++++++++++++++++++++++++++++++++++-- fs/f2fs/verity.c | 2 ++ fs/f2fs/xattr.c | 8 +++++++ include/linux/f2fs_fs.h | 25 ++++++++++++++++++++- 16 files changed, 150 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c0d44fbcba56..4b0b655d8487 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -732,6 +732,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } @@ -920,6 +921,7 @@ static int __f2fs_cluster_blocks(struct inode *inode, if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); goto fail; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f6c0465e1a51..0e03bfe48570 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -686,8 +686,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? - META_GENERIC : DATA_GENERIC_ENHANCE))) + META_GENERIC : DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -877,8 +879,10 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, - __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) + __is_meta_io(fio) ? 
META_GENERIC : DATA_GENERIC)) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -1181,6 +1185,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } goto got_it; @@ -1201,6 +1207,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } got_it: @@ -1519,6 +1527,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } @@ -1564,6 +1573,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, (flag != F2FS_GET_BLOCK_FIEMAP || IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); goto sync_out; } if (flag == F2FS_GET_BLOCK_BMAP) { @@ -2095,6 +2106,8 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto out; } } else { @@ -2673,8 +2686,11 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, - DATA_GENERIC_ENHANCE)) + DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(fio->sbi, + ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } ipu_force = true; fio->need_lock = LOCK_DONE; @@ -2702,6 +2718,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); goto out_writepage; } @@ -3618,6 +3635,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4f64965fa5b6..13db21672fb4 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1043,6 +1043,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, __func__, le16_to_cpu(de->name_len)); set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); goto out; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 429ec0ff1e52..6429d2120e2b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1814,6 +1814,10 @@ struct f2fs_sb_info { struct workqueue_struct *post_read_wq; /* post read workqueue */ + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + spinlock_t error_lock; /* protect errors array */ + bool error_dirty; /* errors of sb is dirty */ + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ @@ -3597,6 +3601,7 @@ int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct 
super_block *sb); void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f1bb4ad8951f..b3aea1a3a357 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1179,6 +1179,7 @@ static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, !f2fs_is_valid_blkaddr(sbi, *blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -1463,6 +1464,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); break; } @@ -3475,8 +3477,10 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3637,8 +3641,10 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3910,6 +3916,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 47746a343444..e210c7f0cb63 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1164,6 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } goto got_it; @@ -1182,6 +1183,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } got_it: diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 369f0da75d42..0e26d283e0a7 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -163,6 +163,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_sbi_flag(fio.sbi, SBI_NEED_FSCK); f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dn->inode->i_ino, dn->data_blkaddr); + f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -418,6 +419,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dir->i_ino, dn.data_blkaddr); + f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR); err = -EFSCORRUPTED; goto out; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 873e17a2b8e8..e07c376f48db 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -81,8 +81,10 @@ static int 
__written_first_block(struct f2fs_sb_info *sbi, if (!__is_valid_data_blkaddr(addr)) return 1; - if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } return 0; } @@ -415,6 +417,7 @@ static int do_read_inode(struct inode *inode) if (!sanity_check_inode(inode, node_page)) { f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } @@ -510,6 +513,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = -EFSCORRUPTED; trace_f2fs_iget_exit(inode, ret); iput(inode); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return ERR_PTR(ret); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 328801c73fd3..4d1e23ac426c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -36,6 +36,7 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } return 0; @@ -1293,6 +1294,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) if (unlikely(new_ni.blk_addr != NULL_ADDR)) { err = -EFSCORRUPTED; set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } #endif diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index fb35375c3e21..5991391733b4 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -506,6 +506,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); return -EFSCORRUPTED; } @@ -636,6 +637,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, inode->i_ino, ofs_of_node(dn.node_page), ofs_of_node(page)); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } @@ -648,12 +650,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } if (__is_valid_data_blkaddr(dest) && !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } @@ -712,6 +716,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", dest, inode->i_ino, dn.ofs_in_node); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a69227879442..b17e1999a6de 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -311,6 +311,8 @@ static int __f2fs_commit_atomic_write(struct inode *inode) DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } @@ -3433,6 +3435,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } @@ -4380,6 +4383,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (se->type >= NR_PERSISTENT_LOG) { 
f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } @@ -4416,6 +4421,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } @@ -4435,6 +4441,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } @@ -4466,6 +4473,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } @@ -4474,6 +4482,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } @@ -4624,6 +4633,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Current segment has invalid alloc_type:%d", curseg->alloc_type); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } @@ -4641,6 +4651,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 6b605af2b87d..ace02b153038 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -753,6 +753,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Mismatch valid blocks %d vs. 
%d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } @@ -767,6 +768,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } return 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index caa5b438491b..a6eb21539f76 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3812,8 +3812,6 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); int err; - f2fs_bug_on(sbi, reason >= MAX_STOP_REASON); - f2fs_down_write(&sbi->sb_lock); if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) @@ -3823,7 +3821,51 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) if (err) f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", reason, err); + f2fs_up_write(&sbi->sb_lock); +} +static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +{ + spin_lock(&sbi->error_lock); + if (!test_bit(flag, (unsigned long *)sbi->errors)) { + set_bit(flag, (unsigned long *)sbi->errors); + sbi->error_dirty = true; + } + spin_unlock(&sbi->error_lock); +} + +static bool f2fs_update_errors(struct f2fs_sb_info *sbi) +{ + bool need_update = false; + + spin_lock(&sbi->error_lock); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + need_update = true; + } + spin_unlock(&sbi->error_lock); + + return need_update; +} + +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + int err; + + f2fs_save_errors(sbi, error); + + f2fs_down_write(&sbi->sb_lock); + + if (!f2fs_update_errors(sbi)) + goto out_unlock; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); +out_unlock: f2fs_up_write(&sbi->sb_lock); } @@ -4167,6 +4209,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_devices; } + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index b751f528e5e5..b36441478c8d 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -245,6 +245,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_VERITY_XATTR); return -EFSCORRUPTED; } if (buf_size) { diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1cd844a8e38a..25f163202b49 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -365,6 +365,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto out; } check: @@ -581,6 +583,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + 
f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto cleanup; } @@ -656,6 +660,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } @@ -682,6 +688,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino, ENTRY_SIZE(last)); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } last = XATTR_NEXT_ENTRY(last); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5dd1e52b8997..ee0d75d9a302 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -87,6 +87,28 @@ enum stop_cp_reason { #define MAX_STOP_REASON 32 +/* detail reason for EFSCORRUPTED */ +enum f2fs_error { + ERROR_CORRUPTED_CLUSTER, + ERROR_FAIL_DECOMPRESSION, + ERROR_INVALID_BLKADDR, + ERROR_CORRUPTED_DIRENT, + ERROR_CORRUPTED_INODE, + ERROR_INCONSISTENT_SUMMARY, + ERROR_INCONSISTENT_FOOTER, + ERROR_INCONSISTENT_SUM_TYPE, + ERROR_CORRUPTED_JOURNAL, + ERROR_INCONSISTENT_NODE_COUNT, + ERROR_INCONSISTENT_BLOCK_COUNT, + ERROR_INVALID_CURSEG, + ERROR_INCONSISTENT_SIT, + ERROR_CORRUPTED_VERITY_XATTR, + ERROR_CORRUPTED_XATTR, + ERROR_MAX, +}; + +#define MAX_F2FS_ERRORS 16 + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -131,7 +153,8 @@ struct f2fs_super_block { __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __u8 s_stop_reason[MAX_STOP_REASON]; /* stop checkpoint reason */ - __u8 reserved[274]; /* valid reserved region */ + __u8 s_errors[MAX_F2FS_ERRORS]; /* reason of image corrupts */ + __u8 reserved[258]; /* valid reserved region */ __le32 crc; /* checksum of superblock */ } __packed; From 485bd449ac1a85ed54b3520948d2b576dd86db08 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 30 Sep 2022 15:48:24 -0700 Subject: [PATCH 497/580] f2fs: allow direct read for zoned device This reverts dbf8e63f48af ("f2fs: remove device type check for direct IO"), and apply the below first version, since it contributed out-of-order DIO writes. For zoned devices, f2fs forbids direct IO and forces buffered IO to serialize write IOs. However, the constraint does not apply to read IOs. Cc: stable@vger.kernel.org Fixes: dbf8e63f48af ("f2fs: remove device type check for direct IO") Signed-off-by: Eunhee Rho Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6429d2120e2b..96ea68dfaf0b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4580,7 +4580,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, /* disallow direct IO if any of devices has unaligned blksize */ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) return true; - + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. + */ + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) + return true; if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { if (block_unaligned_IO(inode, iocb, iter)) return true; From 0edf4b950e3b277c5ad1df15a728bf26fd08da4b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Oct 2022 09:11:33 +0800 Subject: [PATCH 498/580] f2fs: account swapfile inodes In order to check count of opened swapfile inodes. 
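For illustration only (not part of the patch): the new counter is exposed through the f2fs debugfs status file, via the debug.c hunk below. A small sketch that prints just that line, assuming debugfs is mounted at /sys/kernel/debug and the kernel is built with CONFIG_F2FS_STAT_FS:

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          FILE *fp = fopen("/sys/kernel/debug/f2fs/status", "r");
          char line[256];

          if (!fp) {
                  perror("fopen");
                  return 1;
          }
          while (fgets(line, sizeof(line), fp)) {
                  /* matches the " - Swapfile Inode: %u" line added by this patch */
                  if (strstr(line, "Swapfile Inode"))
                          fputs(line, stdout);
          }
          fclose(fp);
          return 0;
  }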
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ fs/f2fs/debug.c | 4 ++++ fs/f2fs/f2fs.h | 9 ++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0e03bfe48570..f4d7b24c0e0c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4225,6 +4225,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret < 0) return ret; + stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; @@ -4234,6 +4235,7 @@ static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); + stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 29cf5b6b2341..7a9dd2319155 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -135,6 +135,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); + si->swapfile_inode = atomic_read(&sbi->swapfile_inode); si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; @@ -385,6 +386,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); + seq_printf(s, " - Swapfile Inode: %u\n", + si->swapfile_inode); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", @@ -607,6 +610,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); atomic64_set(&sbi->compr_blocks, 0); + atomic_set(&sbi->swapfile_inode, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 96ea68dfaf0b..65f657fe625a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1769,6 +1769,7 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ + atomic_t swapfile_inode; /* # of swapfile inodes */ atomic_t max_aw_cnt; /* max # of atomic writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ @@ -3920,7 +3921,7 @@ struct f2fs_stat_info { int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode; + int compr_inode, swapfile_inode; unsigned long long compr_blocks; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; @@ -4009,6 +4010,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_inc_swapfile_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_dec_swapfile_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -4093,6 
+4098,8 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) +#define stat_inc_swapfile_inode(inode) do { } while (0) +#define stat_dec_swapfile_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) From a36b0c3d9cdf21d1671d8551f78691c837a71608 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Oct 2022 09:41:02 +0800 Subject: [PATCH 499/580] f2fs: change to use atomic_t type form sbi.atomic_files inode_lock[ATOMIC_FILE] was used for protecting sbi->atomic_files, update atomic_files variable's type to atomic_t instead of unsigned int, then inode_lock[ATOMIC_FILE] can be obsoleted. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 3 ++- fs/f2fs/f2fs.h | 11 ++++++++--- fs/f2fs/file.c | 4 +--- fs/f2fs/segment.c | 6 +----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 7a9dd2319155..a216dcdf6941 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,7 +91,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->aw_cnt = sbi->atomic_files; + si->aw_cnt = atomic_read(&sbi->atomic_files); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); @@ -611,6 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->compr_inode, 0); atomic64_set(&sbi->compr_blocks, 0); atomic_set(&sbi->swapfile_inode, 0); + atomic_set(&sbi->atomic_files, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 65f657fe625a..8fb64048e510 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1250,7 +1250,6 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ - ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1738,7 +1737,6 @@ struct f2fs_sb_info { unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ - unsigned int atomic_files; /* # of opened atomic file */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -1770,6 +1768,7 @@ struct f2fs_sb_info { atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ atomic_t swapfile_inode; /* # of swapfile inodes */ + atomic_t atomic_files; /* # of opened atomic file */ atomic_t max_aw_cnt; /* max # of atomic writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ @@ -4014,6 +4013,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) #define stat_dec_swapfile_inode(inode) \ (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_inc_atomic_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) +#define stat_dec_atomic_inode(inode) \ + 
(atomic_dec(&F2FS_I_SB(inode)->atomic_files)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -4033,7 +4036,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_inc(&(sbi)->inplace_count)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = F2FS_I_SB(inode)->atomic_files; \ + int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ @@ -4100,6 +4103,8 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_inc_swapfile_inode(inode) do { } while (0) #define stat_dec_swapfile_inode(inode) do { } while (0) +#define stat_inc_atomic_inode(inode) do { } while (0) +#define stat_dec_atomic_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b3aea1a3a357..1c0d96596acc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2130,9 +2130,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) } f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files++; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b17e1999a6de..2ddc48b886de 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,7 +186,6 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) void f2fs_abort_atomic_write(struct inode *inode, bool clean) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_is_atomic_file(inode)) @@ -199,10 +198,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_FILE); - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files--; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + stat_dec_atomic_inode(inode); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, From ac015cb32643380e15bd5c4a541c1fd1f1ec293a Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 24 Oct 2022 19:30:12 +0200 Subject: [PATCH 500/580] f2fs: should put a page when checking the summary info The commit introduces another bug. 
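For context (illustration only, not part of the patch): the bug being fixed is a node page reference leak. is_alive() takes a reference on the node page, and the sanity check added by the commit referenced below returned false without dropping it; the one-line change adds the missing f2fs_put_page(). A minimal, self-contained model of the pattern, using made-up helper names rather than the real kernel API:

  #include <assert.h>

  static int page_refcount;               /* stands in for the node page's refcount */

  static void get_page(void) { page_refcount++; }
  static void put_page(void) { page_refcount--; }

  /* Old shape: the early return skips the matching put. */
  static int check_buggy(int bad_summary)
  {
          get_page();
          if (bad_summary)
                  return 0;               /* leaks one reference */
          put_page();
          return 1;
  }

  /* Patched shape: every exit path drops the reference, as the hunk below
   * does with f2fs_put_page(node_page, 1). */
  static int check_fixed(int bad_summary)
  {
          get_page();
          if (bad_summary) {
                  put_page();
                  return 0;
          }
          put_page();
          return 1;
  }

  int main(void)
  {
          check_buggy(1);
          assert(page_refcount == 1);     /* one reference left behind */
          page_refcount = 0;
          check_fixed(1);
          assert(page_refcount == 0);     /* balanced */
          return 0;
  }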
Cc: stable@vger.kernel.org Fixes: c6ad7fd16657e ("f2fs: fix to do sanity check on summary info") Signed-off-by: Pavel Machek Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e210c7f0cb63..03c32405ed3b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1110,6 +1110,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", ofs_in_node, dni->ino, dni->nid, max_addrs); + f2fs_put_page(node_page, 1); return false; } From 295ae14fe969715431c8639ff0d49df55a69f38a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2022 17:52:05 -0700 Subject: [PATCH 501/580] f2fs: let's avoid to get cp_rwsem twice by f2fs_evict_inode by d_invalidate f2fs_unlink -> f2fs_lock_op -> d_invalidate -> shrink_dentry_list -> iput_final -> f2fs_evict_inode -> f2fs_lock_op Reviewed-by: Chao Yu Tested-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b2ebd64f8108..833360f2951d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -627,6 +627,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); + f2fs_unlock_op(sbi); + #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -637,8 +639,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - f2fs_unlock_op(sbi); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: From cd211510039bc286eabbf937f29a25a1c81f1649 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 18 Oct 2022 10:45:32 +0800 Subject: [PATCH 502/580] f2fs: Fix the race condition of resize flag between resizefs
Because the SBI_IS_RESIZEFS flag is not set/cleared under any lock, the following race is possible:

thread1                         thread2
->ioctl(resizefs)
 ->set RESIZEFS flag
                                ->ioctl(resizefs)
                                 ...
                                 ->set RESIZEFS flag
 ->clear RESIZEFS flag
                                 ->resizefs stream
                                 # No RESIZEFS flag in the stream

Also, before freeze_super() the resize has not actually started, so the SBI_IS_RESIZEFS flag should not be set that early. So move the set/clear of the SBI_IS_RESIZEFS flag to within the cp_mutex and gc_lock critical section.
Fixes: b4b10061ef98 ("f2fs: refactor resize_fs to avoid meta updates in progress") Signed-off-by: Zhang Xiaoxu Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 03c32405ed3b..e497750d356c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2133,8 +2133,6 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) if (err) return err; - set_sbi_flag(sbi, SBI_IS_RESIZEFS); - freeze_super(sbi->sb); f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); @@ -2150,6 +2148,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) if (err) goto out_err; + set_sbi_flag(sbi, SBI_IS_RESIZEFS); err = free_segment_range(sbi, secs, false); if (err) goto recover_out; @@ -2173,6 +2172,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) f2fs_commit_super(sbi, false); } recover_out: + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); @@ -2185,6 +2185,5 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); - clear_sbi_flag(sbi, SBI_IS_RESIZEFS); return err; } From 2600354ba9dd01db7ae253c603c2fc7062900db5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 21 Oct 2022 10:34:22 +0800 Subject: [PATCH 503/580] f2fs: fix to invalidate dcc->f2fs_issue_discard in error path Syzbot reports a NULL pointer dereference issue as below: __refcount_add include/linux/refcount.h:193 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] get_task_struct include/linux/sched/task.h:110 [inline] kthread_stop+0x34/0x1c0 kernel/kthread.c:703 f2fs_stop_discard_thread+0x3c/0x5c fs/f2fs/segment.c:1638 kill_f2fs_super+0x5c/0x194 fs/f2fs/super.c:4522 deactivate_locked_super+0x70/0xe8 fs/super.c:332 deactivate_super+0xd0/0xd4 fs/super.c:363 cleanup_mnt+0x1f8/0x234 fs/namespace.c:1186 __cleanup_mnt+0x20/0x30 fs/namespace.c:1193 task_work_run+0xc4/0x14c kernel/task_work.c:177 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x26c/0xbe0 kernel/exit.c:795 do_group_exit+0x60/0xe8 kernel/exit.c:925 __do_sys_exit_group kernel/exit.c:936 [inline] __se_sys_exit_group kernel/exit.c:934 [inline] __wake_up_parent+0x0/0x40 kernel/exit.c:934 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 The root cause of this issue is in error path of f2fs_start_discard_thread(), it missed to invalidate dcc->f2fs_issue_discard, later kthread_stop() may access invalid pointer. 
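For context (illustration only, not part of the patch): kthread_run() returns an ERR_PTR() on failure, so leaving that value in dcc->f2fs_issue_discard lets a later kthread_stop() dereference a bogus pointer; the segment.c hunk below clears the pointer instead. A small, self-contained model of the pattern, using simplified stand-ins for ERR_PTR()/IS_ERR() rather than the kernel helpers:

  #include <errno.h>
  #include <stdio.h>

  /* simplified stand-ins in the style of include/linux/err.h */
  static void *ERR_PTR(long err)        { return (void *)err; }
  static int IS_ERR(const void *ptr)    { return (unsigned long)ptr >= (unsigned long)-4095; }

  static void *issue_discard;           /* stands in for dcc->f2fs_issue_discard */

  static void *kthread_run_stub(int fail)
  {
          return fail ? ERR_PTR(-ENOMEM) : (void *)0x1;   /* fake task pointer */
  }

  int main(void)
  {
          issue_discard = kthread_run_stub(1);            /* thread creation fails */
          if (IS_ERR(issue_discard)) {
                  /* Without the fix the ERR_PTR value stays behind and a later
                   * kthread_stop(issue_discard) dereferences it; the fix NULLs it. */
                  issue_discard = NULL;
          }
          printf("pointer after failed start: %p\n", issue_discard);
          return 0;
  }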
Fixes: 4d67490498ac ("f2fs: Don't create discard thread when device doesn't support realtime discard") Reported-by: syzbot+035a381ea1afb63f098d@syzkaller.appspotmail.com Reported-by: syzbot+729c925c2d9fc495ddee@syzkaller.appspotmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2ddc48b886de..8f6870e41ded 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2025,8 +2025,10 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) + if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); + dcc->f2fs_issue_discard = NULL; + } return err; } From 82f7c7afdaff2844b89e3d764390fbd49ddbc808 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 Oct 2022 23:09:28 +0800 Subject: [PATCH 504/580] f2fs: support fault injection for f2fs_is_valid_blkaddr() This patch supports to inject fault into f2fs_is_valid_blkaddr() to simulate accessing inconsistent data/meta block addressses from caller. Usage: a) echo 262144 > /sys/fs/f2fs//inject_type or b) mount -o fault_type=262144 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + 4 files changed, 8 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 3a5e309823ba..0936698a71ad 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -199,6 +199,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_SLAB_ALLOC 0x000008000 FAULT_DQUOT_INIT 0x000010000 FAULT_LOCK_OP 0x000020000 + FAULT_BLKADDR 0x000040000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 32828de6f7e2..09654a931054 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -171,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { + if (time_to_inject(sbi, FAULT_BLKADDR)) { + f2fs_show_injection_info(sbi, FAULT_BLKADDR); + return false; + } + switch (type) { case META_NAT: break; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8fb64048e510..b4d0ba49e51c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -56,6 +56,7 @@ enum { FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, + FAULT_BLKADDR, FAULT_MAX, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a6eb21539f76..061629935333 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -59,6 +59,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR] = "invalid blkaddr", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, From 4bca70ec3434c766fe41b12e03f0710bddd24615 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:08:31 +0800 Subject: [PATCH 505/580] f2fs: remove batched_trim_sections node commit 377224c47118("f2fs: don't split checkpoint in fstrim") obsolete batch mode and related sysfs entry. 
Since this testing sysfs node has been deprecated for a long time, let's remove it. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/sysfs.c | 5 ----- 2 files changed, 8 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b4d0ba49e51c..a811fd99e325 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1061,9 +1061,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for batched trimming */ - unsigned int trim_sections; /* # of sections to trim */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e5191589aa29..307d8ca65f11 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -496,9 +496,6 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return -EINVAL; } - if (!strcmp(a->attr.name, "trim_sections")) - return -EINVAL; - if (!strcmp(a->attr.name, "gc_urgent")) { if (t == 0) { sbi->gc_mode = GC_NORMAL; @@ -790,7 +787,6 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -916,7 +912,6 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), ATTR_LIST(pending_discard), - ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), From 0745a1a456a9e9c2338b76982ce282ebdb91b8e3 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:24 +0800 Subject: [PATCH 506/580] f2fs: fix gc mode when gc_urgent_high_remaining is 1 Under the current logic, when gc_urgent_high_remaining is set to 1, the mode will be switched to normal at the beginning, instead of running in gc_urgent mode. Let's switch the gc mode back to normal when the gc ends. Fixes: 265576181b4a ("f2fs: remove gc_urgent_high_limited for cleanup") Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e497750d356c..64980411d592 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -96,16 +96,6 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) - sbi->gc_mode = GC_NORMAL; - } - spin_unlock(&sbi->gc_urgent_high_lock); - } - if (sbi->gc_mode == GC_URGENT_HIGH || sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; @@ -162,6 +152,15 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: + if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_urgent_high_lock); + } sb_end_write(sbi->sb); } while (!kthread_should_stop()); From 97c8033578540d6546571358c8657d8556fa73c6 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:05:26 +0800 Subject: [PATCH 507/580] f2fs: cleanup in f2fs_create_flush_cmd_control() Just cleanup for readable, no functional changes. Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8f6870e41ded..185df1b61aeb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -619,12 +619,12 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err = 0; + int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) - return err; + return 0; goto init_thread; } @@ -637,7 +637,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) - return err; + return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, @@ -649,7 +649,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } - return err; + return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) From 005389bc43c3fa46d246e9fa3c0cb2b94513b4ac Mon Sep 17 00:00:00 2001 From: Dongdong Zhang Date: Tue, 25 Oct 2022 17:40:36 +0800 Subject: [PATCH 508/580] f2fs: fix normal discard process In the DPOLICY_BG mode, there is a conflict between the two conditions "i + 1 < dpolicy->granularity" and "i < DEFAULT_DISCARD_GRANULARITY". If i = 15, the first condition is false, it will enter the second condition and dispatch all small granularity discards in function __issue_discard_cmd_orderly. The restrictive effect of the first condition to small discards will be invalidated. These two conditions should align. 
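To make the conflict concrete (illustration only, not part of the patch), take the default background discard granularity of 16 and the largest small-discard list index i = 15; the old pair of tests disagrees exactly there, and the segment.c hunk below makes them agree:

  #include <stdio.h>

  #define DEFAULT_DISCARD_GRANULARITY 16

  int main(void)
  {
          int granularity = 16;   /* dpolicy->granularity for background discard */
          int i = 15;             /* pend_list index: holds discards of i + 1 = 16 blocks */

          /* old code: the two tests disagree at i == 15 */
          printf("skip small discards? %d\n", i + 1 < granularity);              /* 0: no break */
          printf("go orderly?          %d\n", i < DEFAULT_DISCARD_GRANULARITY);  /* 1: issues them anyway */

          /* patched: both tests use i + 1, so they agree */
          printf("go orderly (fixed)?  %d\n", i + 1 < DEFAULT_DISCARD_GRANULARITY);   /* 0 */
          return 0;
  }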
Fixes: 20ee4382322c ("f2fs: issue small discard by LBA order") Signed-off-by: Dongdong Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 185df1b61aeb..65660a5c2b8d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1449,7 +1449,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, if (i + 1 < dpolicy->granularity) break; - if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; From 60f9e58b4c2c3459b53a86b24f799231b3e71a64 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 01:54:01 +0800 Subject: [PATCH 509/580] f2fs: add barrier mount option This patch adds a mount option, barrier, in f2fs. The barrier option is the opposite of nobarrier. If this option is set, cache_flush commands are allowed to be issued. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 2 ++ fs/f2fs/super.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 0936698a71ad..27cca9e4c3df 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -154,6 +154,8 @@ nobarrier This option can be used if underlying storage guarantees If this option is set, no cache_flush commands are issued but f2fs still guarantees the write ordering of all the data writes. +barrier If this option is set, cache_flush commands are allowed to be + issued. fastboot This option is used when a system wants to reduce mount time as much as possible, even though normal performance can be sacrificed. diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 061629935333..535fa608508b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -109,6 +109,7 @@ enum { Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, + Opt_barrier, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, @@ -184,6 +185,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, @@ -798,6 +800,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; + case Opt_barrier: + clear_opt(sbi, NOBARRIER); + break; case Opt_fastboot: set_opt(sbi, FASTBOOT); break; @@ -1924,6 +1929,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); + else + seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); if (test_opt(sbi, EXTENT_CACHE)) From ec78dcf2d2a2d4aaa3c8ebb6c6d55d5237e31f3a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Oct 2022 16:00:35 -0700 Subject: [PATCH 510/580] f2fs: allow to set compression for inlined file The below commit disallows to set compression on empty created file which has a inline_data. Let's fix it. 
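The regression is easiest to see from user space (illustration only, not part of the patch). A hedged sketch of the failing sequence, assuming an f2fs image created with the compression feature and mounted with a compress_algorithm= option; setting FS_COMPR_FL via FS_IOC_SETFLAGS is the same path chattr +c uses:

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/fs.h>

  int main(int argc, char **argv)
  {
          int fd, flags;

          if (argc != 2) {
                  fprintf(stderr, "usage: %s <new file on f2fs>\n", argv[0]);
                  return 1;
          }
          /* A freshly created empty file gets inline_data when the feature is on. */
          fd = open(argv[1], O_CREAT | O_RDWR, 0644);
          if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
                  perror("open/FS_IOC_GETFLAGS");
                  return 1;
          }
          flags |= FS_COMPR_FL;                   /* same bit chattr +c sets */
          if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
                  perror("FS_IOC_SETFLAGS");      /* EINVAL before this patch */
          else
                  puts("compression flag set");
          close(fd);
          return 0;
  }

On an unpatched kernel this is expected to fail with EINVAL for a brand-new empty file; with the patch applied it should succeed.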
Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1c0d96596acc..0b2510624f72 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1891,6 +1891,10 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } else { + /* try to convert inline_data to support compression */ + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (!f2fs_may_compress(inode)) return -EINVAL; if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) From 54836ca3c40e780ebd12cfaf4b5aa434363cc7af Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:32:26 +0800 Subject: [PATCH 511/580] f2fs: introduce max_ordered_discard sysfs node The current max_ordered_discard is a fixed value, change it to be configurable through the sys node. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 11 +++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index e7b5c1fd661e..73bd910ce4dd 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -96,6 +96,12 @@ Description: Controls the issue rate of discard commands that consist of small checkpoint is triggered, and issued during the checkpoint. By default, it is disabled with 0. +What: /sys/fs/f2fs//max_ordered_discard +Date: October 2022 +Contact: "Yangtao Li" +Description: Controls the maximum ordered discard, the unit size is one block(4KB). + Set it to 16 by default. + What: /sys/fs/f2fs//max_discard_request Date: December 2021 Contact: "Konstantin Vyshetsky" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a811fd99e325..b0cfedd18c1f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -327,6 +327,8 @@ struct discard_entry { /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 +/* default maximum discard granularity of ordered discard, unit: block count */ +#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 /* max discard pend list number */ #define MAX_PLIST_NUM 512 @@ -406,6 +408,7 @@ struct discard_cmd_control { unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. 
interval between discard issue */ unsigned int discard_granularity; /* discard granularity */ + unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 65660a5c2b8d..6d8c05c84f94 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1449,7 +1449,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, if (i + 1 < dpolicy->granularity) break; - if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; @@ -2048,6 +2048,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 307d8ca65f11..aa9ec5cae23b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -491,6 +491,15 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "max_ordered_discard")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -786,6 +795,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); @@ -911,6 +921,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), + ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), From b01f8d9aaabed20f09414d17a8d5ba2f188de43b Mon Sep 17 00:00:00 2001 From: Keoseong Park Date: Thu, 27 Oct 2022 20:01:05 +0900 Subject: [PATCH 512/580] f2fs: Fix typo in comments Change "truncateion" to "truncation". 
Signed-off-by: Keoseong Park Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0b2510624f72..871e70706d9a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -590,7 +590,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - /* Assumption: truncateion starts with cluster */ + /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); From 0bee4353a760bc22bfdc71ef079d6157a1a52a8e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 27 Oct 2022 18:24:46 +0800 Subject: [PATCH 513/580] f2fs: fix return val in f2fs_start_ckpt_thread() Return PTR_ERR(cprc->f2fs_issue_ckpt) instead of -ENOMEM; Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- fs/f2fs/gc.c | 15 +++++++-------- fs/f2fs/segment.c | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 09654a931054..ed28df0e19d3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1902,8 +1902,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(cprc->f2fs_issue_ckpt)) { + int err = PTR_ERR(cprc->f2fs_issue_ckpt); + cprc->f2fs_issue_ckpt = NULL; - return -ENOMEM; + return err; } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 64980411d592..d07b04d4a592 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -171,13 +171,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; - int err = 0; gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) { - err = -ENOMEM; - goto out; - } + if (!gc_th) + return -ENOMEM; gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; @@ -192,12 +189,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { - err = PTR_ERR(gc_th->f2fs_gc_task); + int err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); sbi->gc_thread = NULL; + return err; } -out: - return err; + + return 0; } void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6d8c05c84f94..5e4a17bfd4e7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -619,7 +619,6 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; @@ -643,7 +642,8 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { - err = PTR_ERR(fcc->f2fs_issue_flush); + int err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; From 83cea6dcd4a7f00dc7d92734bafc55f984ac91b8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Oct 2022 17:30:26 +0800 Subject: [PATCH 514/580] f2fs: fix to destroy 
sbi->post_read_wq in error path of f2fs_fill_super() In error path of f2fs_fill_super(), this patch fixes to call f2fs_destroy_post_read_wq() once if we fail in f2fs_start_ckpt_thread(). Fixes: 261eeb9c1585 ("f2fs: introduce checkpoint_merge mount option") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 535fa608508b..097c8ece5a4c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4474,9 +4474,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); - f2fs_destroy_post_read_wq(sbi); stop_ckpt_thread: f2fs_stop_ckpt_thread(sbi); + f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); From f45617c5a5cdde15de3288e8f01bc2c7ea70253f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:32:16 +0800 Subject: [PATCH 515/580] f2fs: introduce gc_mode sysfs node Revert "f2fs: make gc_urgent and gc_segment_mode sysfs node readable". Add a gc_mode sysfs node to show the current gc_mode as a string. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 12 ++++++++---- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 73bd910ce4dd..40d83b851533 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -615,3 +615,9 @@ Date: July 2022 Contact: "Daeho Jeong" Description: Show the accumulated total revoked atomic write block count after boot. If you write "0" here, you can initialize to "0". + +What: /sys/fs/f2fs//gc_mode +Date: October 2022 +Contact: "Yangtao Li" +Description: Show the current gc_mode as a string. + This is a read-only entry. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b0cfedd18c1f..9d50479fe653 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1312,6 +1312,7 @@ enum { MAX_TIME, }; +/* Note that you need to keep synchronization with this gc_mode_names array */ enum { GC_NORMAL, GC_IDLE_CB, diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index aa9ec5cae23b..dc82becd8efe 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -143,6 +143,12 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } +static ssize_t gc_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%s\n", gc_mode_names[sbi->gc_mode]); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -340,10 +346,6 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return snprintf(buf, PAGE_SIZE, "%s\n", gc_mode_names[sbi->gc_mode]); - if (!strcmp(a->attr.name, "gc_segment_mode")) - return snprintf(buf, PAGE_SIZE, "%s\n", - gc_mode_names[sbi->gc_segment_mode]); - if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { return snprintf(buf, PAGE_SIZE, "%u\n", sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); @@ -844,6 +846,7 @@ F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); +F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -923,6 +926,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), + ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), From 63c7fe933dad91a0cfa44c126e2ff5c2ba41c004 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Oct 2022 10:07:13 -0700 Subject: [PATCH 516/580] f2fs: add missing bracket in doc Let's add missing <>. Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 40d83b851533..fa5936f8ffbb 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -238,7 +238,7 @@ Description: Shows total written kbytes issued to disk. What: /sys/fs/f2fs//features Date: July 2017 Contact: "Jaegeuk Kim" -Description: /feature_list/ +Description: /feature_list/> Shows all enabled features in current device. Supported features: encryption, blkzoned, extra_attr, projquota, inode_checksum, From 3056a751c095b65a01639b50f4a9736f2bead060 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:25 +0800 Subject: [PATCH 517/580] f2fs: replace gc_urgent_high_remaining with gc_remaining_trials The user can set the trial count limit for GC urgent and idle mode with replaced gc_remaining_trials.. If GC thread gets to the limit, the mode will turn back to GC normal mode finally. It was applied only to GC_URGENT, while this patch expands it for GC_IDLE. 
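A rough user-space sketch of how the renamed knob is meant to be used follows; the device name and the value written to the companion gc_urgent node are assumptions based on the existing f2fs sysfs layout, so adjust them for the actual system.

#include <stdio.h>

static int write_sysfs(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s", val);
	return fclose(f);
}

int main(void)
{
	const char *base = "/sys/fs/f2fs/sda1";	/* example device, adjust */
	char path[256];

	snprintf(path, sizeof(path), "%s/gc_remaining_trials", base);
	write_sysfs(path, "128");	/* fall back to GC_NORMAL after 128 rounds */

	snprintf(path, sizeof(path), "%s/gc_urgent", base);
	write_sysfs(path, "1");		/* assumed to select GC_URGENT_HIGH */

	return 0;
}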
Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++---- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 12 ++++++------ fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 12 ++++++------ 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index fa5936f8ffbb..b73b23f34704 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -579,10 +579,10 @@ Description: With "mode=fragment:block" mount options, we can scatter block allo in the length of 1.. by turns. This value can be set between 1..512 and the default value is 4. -What: /sys/fs/f2fs//gc_urgent_high_remaining -Date: December 2021 -Contact: "Daeho Jeong" -Description: You can set the trial count limit for GC urgent high mode with this value. +What: /sys/fs/f2fs//gc_remaining_trials +Date: October 2022 +Contact: "Yangtao Li" +Description: You can set the trial count limit for GC urgent and idle mode with this value. If GC thread gets to the limit, the mode will turn back to GC normal mode. By default, the value is zero, which means there is no limit like before. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9d50479fe653..f3a58d6327c4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1735,8 +1735,9 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ - spinlock_t gc_urgent_high_lock; - unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ + spinlock_t gc_remaining_trials_lock; + /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ + unsigned int gc_remaining_trials; /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d07b04d4a592..d767bb7cedba 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -152,14 +152,14 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) + if (sbi->gc_mode != GC_NORMAL) { + spin_lock(&sbi->gc_remaining_trials_lock); + if (sbi->gc_remaining_trials) { + sbi->gc_remaining_trials--; + if (!sbi->gc_remaining_trials) sbi->gc_mode = GC_NORMAL; } - spin_unlock(&sbi->gc_urgent_high_lock); + spin_unlock(&sbi->gc_remaining_trials_lock); } sb_end_write(sbi->sb); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 097c8ece5a4c..dd7ddfe26586 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3591,7 +3591,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->migration_granularity = sbi->segs_per_sec; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; - spin_lock_init(&sbi->gc_urgent_high_lock); + spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index dc82becd8efe..cdabe424a39e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -547,10 +547,10 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } - if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { - spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_remaining = t; - 
spin_unlock(&sbi->gc_urgent_high_lock); + if (!strcmp(a->attr.name, "gc_remaining_trials")) { + spin_lock(&sbi->gc_remaining_trials_lock); + sbi->gc_remaining_trials = t; + spin_unlock(&sbi->gc_remaining_trials_lock); return count; } @@ -833,7 +833,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -959,7 +959,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), - ATTR_LIST(gc_urgent_high_remaining), + ATTR_LIST(gc_remaining_trials), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), From ea7e541aa50e96ef6209c8878da8a82f04edd77e Mon Sep 17 00:00:00 2001 From: "wangkailong@jari.cn" Date: Sat, 29 Oct 2022 22:49:30 +0800 Subject: [PATCH 518/580] f2fs: replace ternary operator with max() Fix the following coccicheck warning: ./fs/f2fs/segment.c:877:24-25: WARNING opportunity for max() Signed-off-by: KaiLong Wang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5e4a17bfd4e7..26ff46c0e3c4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -855,7 +855,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) } mutex_unlock(&dirty_i->seglist_lock); - unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE]; + unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; From 7b03f0927be4584de3802918b4bbb894a8a3ed2c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 8 Nov 2022 17:59:34 -0800 Subject: [PATCH 519/580] f2fs: allow to read node block after shutdown If block address is still alive, we should give a valid node block even after shutdown. Otherwise, we can see zero data when reading out a file. Cc: stable@vger.kernel.org Fixes: 83a3bfdb5a8 ("f2fs: indicate shutdown f2fs to allow unmount successfully") Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4d1e23ac426c..a1984bf22177 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1358,8 +1358,7 @@ static int read_node_page(struct page *page, int op_flags) return err; /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ - if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || - is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) { ClearPageUptodate(page); return -ENOENT; } From 6ba8bc0320b6f0b90b23a0a627e2e8d3ca2b683b Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 21:26:38 +0800 Subject: [PATCH 520/580] f2fs: add proc entry to show discard_plist info This patch adds a new proc entry to show discard_plist information in more detail, which is very helpful to know the discard pend list count clearly. 
Such as: Discard pend list(Show diacrd_cmd count on each entry, .:not exist): 0 390 156 85 67 46 37 26 14 8 17 12 9 9 6 12 11 10 16 5 9 2 4 8 3 4 1 24 3 2 2 5 2 4 5 4 32 3 3 2 3 . 3 3 1 40 . 4 1 3 2 1 2 1 48 1 . 1 1 . 1 1 . 56 . 1 1 1 . 2 . 1 64 1 2 . . . . . . 72 . 1 . . . . . . 80 3 1 . . 1 1 . . 88 1 . . . 1 . . 1 ...... Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index cdabe424a39e..9e84c3ce6a27 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1242,6 +1242,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i, count; + + seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n"); + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + if (dcc) { + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + + if (i % 8 == 0) + seq_printf(seq, " %-3d", i); + count = 0; + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + count++; + if (count) + seq_printf(seq, " %7d", count); + else + seq_puts(seq, " ."); + if (i % 8 == 7) + seq_putc(seq, '\n'); + } + seq_putc(seq, '\n'); + mutex_unlock(&dcc->cmd_lock); + } + + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -1312,6 +1350,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) #endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + discard_plist_seq_show, sb); } return 0; put_feature_list_kobj: @@ -1335,6 +1375,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry("discard_plist_info", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } From 3a74f579c2ea768e1e66138721b3890c8f98781c Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 31 Oct 2022 12:24:15 -0700 Subject: [PATCH 521/580] f2fs: correct i_size change for atomic writes We need to make sure i_size doesn't change until atomic write commit is successful and restore it when commit is failed. 
Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 18 +++++++++++------- fs/f2fs/inode.c | 5 ++++- fs/f2fs/segment.c | 14 ++++++++++---- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f3a58d6327c4..b8ec79c99f4a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -764,6 +764,7 @@ enum { FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ + FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ FI_MAX, /* max flag, never be used */ }; @@ -824,6 +825,7 @@ struct f2fs_inode_info { unsigned int i_cluster_size; /* cluster size */ unsigned int atomic_write_cnt; + loff_t original_i_size; /* original i_size before atomic write */ }; static inline void get_extent_info(struct extent_info *ext, @@ -3084,6 +3086,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode, set_inode_flag(inode, FI_AUTO_RECOVER); } +static inline bool f2fs_is_atomic_file(struct inode *inode); + static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); @@ -3093,6 +3097,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); + + if (f2fs_is_atomic_file(inode)) + return; + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 871e70706d9a..80690e9ddb2e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2074,6 +2074,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inode *pinode; + loff_t isize; int ret; if (!inode_owner_or_capable(inode)) @@ -2132,7 +2133,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } - f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + + f2fs_write_inode(inode, NULL); + + isize = i_size_read(inode); + fi->original_i_size = isize; + f2fs_i_size_write(fi->cow_inode, isize); stat_inc_atomic_inode(inode); @@ -2169,16 +2175,14 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { ret = f2fs_commit_atomic_write(inode); - if (ret) - goto unlock_out; - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_abort_atomic_write(inode, false); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + + f2fs_abort_atomic_write(inode, ret); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e07c376f48db..50220a93a5a2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -621,9 +621,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_uid = cpu_to_le32(i_uid_read(inode)); ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); - ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + if (!f2fs_is_atomic_file(inode) || + is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + ri->i_size = cpu_to_le64(i_size_read(inode)); + if (et) { read_lock(&et->lock); set_raw_extent(&et->largest, &ri->i_ext); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 26ff46c0e3c4..122fd737a38a 100644 --- 
a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -191,14 +191,18 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) if (!f2fs_is_atomic_file(inode)) return; - if (clean) - truncate_inode_pages_final(inode->i_mapping); clear_inode_flag(fi->cow_inode, FI_COW_FILE); iput(fi->cow_inode); fi->cow_inode = NULL; release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); + + if (clean) { + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, fi->original_i_size); + } } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -334,10 +338,12 @@ static int __f2fs_commit_atomic_write(struct inode *inode) } out: - if (ret) + if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; - else + } else { sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); + } __complete_revoke_list(inode, &revoke_list, ret ? true : false); From 658c36ab5104d6ce794a1449eb01f194b1564b27 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 6 Nov 2022 21:25:44 +0800 Subject: [PATCH 522/580] f2fs: fix to avoid accessing uninitialized spinlock syzbot reports a kernel bug: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e3/0x2cb lib/dump_stack.c:106 assign_lock_key+0x22a/0x240 kernel/locking/lockdep.c:981 register_lock_class+0x287/0x9b0 kernel/locking/lockdep.c:1294 __lock_acquire+0xe4/0x1f60 kernel/locking/lockdep.c:4934 lock_acquire+0x1a7/0x400 kernel/locking/lockdep.c:5668 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:350 [inline] f2fs_save_errors fs/f2fs/super.c:3868 [inline] f2fs_handle_error+0x29/0x230 fs/f2fs/super.c:3896 f2fs_iget+0x215/0x4bb0 fs/f2fs/inode.c:516 f2fs_fill_super+0x47d3/0x7b50 fs/f2fs/super.c:4222 mount_bdev+0x26c/0x3a0 fs/super.c:1401 legacy_get_tree+0xea/0x180 fs/fs_context.c:610 vfs_get_tree+0x88/0x270 fs/super.c:1531 do_new_mount+0x289/0xad0 fs/namespace.c:3040 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount+0x2e3/0x3d0 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd F2FS-fs (loop1): Failed to read F2FS meta data inode The root cause is if sbi->error_lock may be accessed before its initialization, fix it. 
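The general rule behind the fix is to initialize embedded locks immediately after the owning object is allocated, before any error path can touch them. A minimal kernel-module sketch of that ordering is shown below; it is not f2fs code, and the demo_* names are hypothetical stand-ins.

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* hypothetical structure standing in for f2fs_sb_info */
struct demo_sb_info {
	spinlock_t error_lock;
	unsigned int errors;
};

static struct demo_sb_info *demo_fill_super(void)
{
	struct demo_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);

	if (!sbi)
		return NULL;

	/* initialize the lock before anything that may record an error */
	spin_lock_init(&sbi->error_lock);

	/* any later (error) path can now take the lock without tripping lockdep */
	spin_lock(&sbi->error_lock);
	sbi->errors = 0;
	spin_unlock(&sbi->error_lock);

	return sbi;
}

static int __init demo_init(void)
{
	struct demo_sb_info *sbi = demo_fill_super();

	if (!sbi)
		return -ENOMEM;
	kfree(sbi);
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");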
Link: https://lore.kernel.org/linux-f2fs-devel/0000000000007edb6605ecbb6442@google.com/T/#u Reported-by: syzbot+40642be9b7e0bb28e0df@syzkaller.appspotmail.com Fixes: 95fa90c9e5a7 ("f2fs: support recording errors into superblock") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dd7ddfe26586..02e7fc124b6e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4150,6 +4150,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_bio_info; + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); @@ -4217,9 +4220,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_devices; } - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, From 5248ad7893ee051a0a5a586c345e48b2bc4a63ac Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Tue, 8 Nov 2022 14:40:38 +0800 Subject: [PATCH 523/580] f2fs: fix atgc bug on issue in 32bits platform There is bug on issue after atgc feature is enabled in 32bits platform as the following log: F2FS-fs (dm-x): inconsistent rbtree, cur(3470333575168) next(3320009719808) ------------[ cut here ]------------ kernel BUG at fs/f2fs/gc.c:602! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM PC is at get_victim_by_default+0x13c0/0x1498 LR is at f2fs_check_rb_tree_consistence+0xc4/0xd4 .... [] (get_victim_by_default) from [] (f2fs_gc+0x220/0x6cc) [] (f2fs_gc) from [] (gc_thread_func+0x2ac/0x708) [] (gc_thread_func) from [] (kthread+0x1a8/0x1b4) [] (kthread) from [] (ret_from_fork+0x14/0x20) the reason is the 64bits key in struct rb_entry has __packed attibute but has not in struct victim_entry, so the wrong key value got by struct rb_entry in f2fs_check_rb_tree_consistence, the following are the memory layouts of struct rb_entry and struct victim_entry in 32bits platform: struct rb_entry { [0] struct rb_node rb_node; union { struct {...}; [12] unsigned long long key; }; } SIZE: 20 struct victim_entry { [0] struct rb_node rb_node; union { struct {...}; [16] struct victim_info vi; }; [32] struct list_head list; } SIZE: 40 This patch fix the inconsistence layout of 64bits key between struct rb_entry and struct victim_entry. Fixes: 093749e296e2 ("f2fs: support age threshold based garbage collection") Signed-off-by: Zhiguo Niu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 19b956c2d697..37e1142edf2c 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -70,7 +70,7 @@ struct victim_entry { struct victim_info vi; /* victim info */ }; struct list_head list; -}; +} __packed; /* * inline functions From 93e5cf073661f6ce87f06e1cbacb3f162e6c5b3f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Nov 2022 22:33:21 +0800 Subject: [PATCH 524/580] f2fs: optimize iteration over sparse directories Wei Chen reports a kernel bug as blew: INFO: task syz-executor.0:29056 blocked for more than 143 seconds. Not tainted 5.15.0-rc5 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
task:syz-executor.0 state:D stack:14632 pid:29056 ppid: 6574 flags:0x00000004 Call Trace: __schedule+0x4a1/0x1720 schedule+0x36/0xe0 rwsem_down_write_slowpath+0x322/0x7a0 fscrypt_ioctl_set_policy+0x11f/0x2a0 __f2fs_ioctl+0x1a9f/0x5780 f2fs_ioctl+0x89/0x3a0 __x64_sys_ioctl+0xe8/0x140 do_syscall_64+0x34/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Eric did some investigation on this issue, quoted from reply of Eric: "Well, the quality of this bug report has a lot to be desired (not on upstream kernel, reproducer is full of totally irrelevant stuff, not sent to the mailing list of the filesystem whose disk image is being fuzzed, etc.). But what is going on is that f2fs_empty_dir() doesn't consider the case of a directory with an extremely large i_size on a malicious disk image. Specifically, the reproducer mounts an f2fs image with a directory that has an i_size of 14814520042850357248, then calls FS_IOC_SET_ENCRYPTION_POLICY on it. That results in a call to f2fs_empty_dir() to check whether the directory is empty. f2fs_empty_dir() then iterates through all 3616826182336513 blocks the directory allegedly contains to check whether any contain anything. i_rwsem is held during this, so anything else that tries to take it will hang." In order to solve this issue, let's use f2fs_get_next_page_offset() to speed up iteration by skipping holes for all below functions: - f2fs_empty_dir - f2fs_readdir - find_in_level The way why we can speed up iteration was described in 'commit 3cf4574705b4 ("f2fs: introduce get_next_page_offset to speed up SEEK_DATA")'. Meanwhile, in f2fs_empty_dir(), let's use f2fs_find_data_page() instead f2fs_get_lock_data_page(), due to i_rwsem was held in caller of f2fs_empty_dir(), there shouldn't be any races, so it's fine to not lock dentry page during lookuping dirents in the page. 
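To see why skipping to the next allocated index matters, here is a stand-alone model, not f2fs code, with toy names (find_block stands in for f2fs_find_data_page and its next_pgofs hint): a million-block sparse directory is covered in a handful of probes instead of one probe per block index up to i_size.

#include <stdio.h>

#define NBLOCKS	1000000ULL	/* pretend i_size spans this many blocks */

/* toy lookup: only every 100000th block exists; on a miss report the next one */
static int find_block(unsigned long long idx, unsigned long long *next)
{
	if (idx % 100000 == 0)
		return 0;				/* found */
	*next = (idx / 100000 + 1) * 100000;		/* next non-hole index */
	return -1;					/* like -ENOENT */
}

int main(void)
{
	unsigned long long bidx = 0, next, probes = 0;

	while (bidx < NBLOCKS) {
		probes++;
		if (find_block(bidx, &next)) {
			bidx = next;	/* skip the whole hole in one step */
			continue;
		}
		/* ... scan dentries in this block ... */
		bidx++;
	}
	printf("probed %llu blocks instead of %llu\n", probes, NBLOCKS);
	return 0;
}

With the toy layout above the loop finishes after 20 probes, which is the same shape as the fixed f2fs_empty_dir()/f2fs_readdir() walk over a maliciously huge i_size.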
Link: https://lore.kernel.org/lkml/536944df-a0ae-1dd8-148f-510b476e1347@kernel.org/T/ Reported-by: Wei Chen Cc: Eric Biggers Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++++++----- fs/f2fs/dir.c | 34 ++++++++++++++++++++++++---------- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 4 ++-- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f4d7b24c0e0c..f92e0b974cd7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1168,7 +1168,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write) + int op_flags, bool for_write, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -1194,12 +1195,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) + if (err) { + if (err == -ENOENT && next_pgofs) + *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; + } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; + if (next_pgofs) + *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && @@ -1243,7 +1249,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -1253,7 +1260,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = f2fs_get_read_data_page(inode, index, 0, false); + page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); if (IS_ERR(page)) return page; @@ -1279,7 +1286,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = f2fs_get_read_data_page(inode, index, 0, for_write); + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 13db21672fb4..941ba42296db 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; + pgoff_t next_pgofs; bool room = false; int max_slots; @@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, le32_to_cpu(fname->hash) % nbucket); end_block = bidx + nblock; - for (; bidx < end_block; bidx++) { + while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = f2fs_find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; + bidx = next_pgofs; continue; } else { *res_page = dentry_page; @@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); + + bidx++; } if (!de && room && F2FS_I(dir)->chash != fname->hash) { @@ -958,7 +962,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bool f2fs_empty_dir(struct inode *dir) { - unsigned long bidx; + 
unsigned long bidx = 0; struct page *dentry_page; unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; @@ -967,13 +971,17 @@ bool f2fs_empty_dir(struct inode *dir) if (f2fs_has_inline_dentry(dir)) return f2fs_empty_inline_dir(dir); - for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = f2fs_get_lock_data_page(dir, bidx, false); + while (bidx < nblock) { + pgoff_t next_pgofs; + + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) + if (PTR_ERR(dentry_page) == -ENOENT) { + bidx = next_pgofs; continue; - else + } else { return false; + } } dentry_blk = page_address(dentry_page); @@ -985,10 +993,12 @@ bool f2fs_empty_dir(struct inode *dir) NR_DENTRY_IN_BLOCK, bit_pos); - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; + + bidx++; } return true; } @@ -1106,7 +1116,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + pgoff_t next_pgofs; /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { @@ -1120,11 +1131,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_find_data_page(inode, n); + dentry_page = f2fs_find_data_page(inode, n, &next_pgofs); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { err = 0; + n = next_pgofs; continue; } else { goto out_free; @@ -1143,6 +1155,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } f2fs_put_page(dentry_page, 0); + + n++; } out_free: fscrypt_fname_free_buffer(&fstr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b8ec79c99f4a..d735be33c87c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3848,8 +3848,9 @@ int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages, bool is_readahead); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - int op_flags, bool for_write); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); + int op_flags, bool for_write, pgoff_t *next_pgofs); +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d767bb7cedba..61447eb3e52e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1561,8 +1561,8 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } - data_page = f2fs_get_read_data_page(inode, - start_bidx, REQ_RAHEAD, true); + data_page = f2fs_get_read_data_page(inode, start_bidx, + REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); From 8799a1bc0701d708bf94f41fe50b923a5ce2c9e4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 9 Nov 2022 07:04:42 +0900 Subject: [PATCH 525/580] f2fs: initialize locks earlier in f2fs_fill_super() syzbot is reporting lockdep warning at f2fs_handle_error() [1], for spin_lock(&sbi->error_lock) is called before spin_lock_init() is called. 
For safe locking in error handling, move initialization of locks (and obvious structures) in f2fs_fill_super() to immediately after memory allocation. Link: https://syzkaller.appspot.com/bug?extid=40642be9b7e0bb28e0df [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Tested-by: syzbot Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 02e7fc124b6e..6b35aea0b13d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4057,6 +4057,24 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->sb = sb; + /* initialize locks within allocated memory */ + init_f2fs_rwsem(&sbi->gc_lock); + mutex_init(&sbi->writepages); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); + spin_lock_init(&sbi->stat_lock); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); + init_waitqueue_head(&sbi->cp_wait); + spin_lock_init(&sbi->error_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + mutex_init(&sbi->flush_lock); + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -4080,6 +4098,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->raw_super = raw_super; + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, @@ -4136,26 +4156,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - init_f2fs_rwsem(&sbi->gc_lock); - mutex_init(&sbi->writepages); - init_f2fs_rwsem(&sbi->cp_global_sem); - init_f2fs_rwsem(&sbi->node_write); - init_f2fs_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); - spin_lock_init(&sbi->stat_lock); err = f2fs_init_write_merge_io(sbi); if (err) goto free_bio_info; - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - - init_f2fs_rwsem(&sbi->cp_rwsem); - init_f2fs_rwsem(&sbi->quota_sem); - init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); err = f2fs_init_iostat(sbi); @@ -4233,12 +4241,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) limit_reserve_root(sbi); adjust_unusable_cap_perc(sbi); - for (i = 0; i < NR_INODE_TYPE; i++) { - INIT_LIST_HEAD(&sbi->inode_list[i]); - spin_lock_init(&sbi->inode_lock[i]); - } - mutex_init(&sbi->flush_lock); - f2fs_init_extent_cache_info(sbi); f2fs_init_ino_entry_info(sbi); From c2911131dc5faf08af4c3521136cff26d131c491 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Wed, 26 Oct 2022 21:05:04 +0800 Subject: [PATCH 526/580] f2fs: fix to set flush_merge opt and show noflush_merge Some minor modifications to flush_merge and related parameters: 1.The FLUSH_MERGE opt is set by default only in non-ro mode. 2.When ro and merge are set at the same time, an error is reported. 3.Display noflush_merge mount opt. 
Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6b35aea0b13d..3a8b4763bcee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1329,6 +1329,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } + if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) + && test_opt(sbi, FLUSH_MERGE)) { + f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1925,8 +1931,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_dentry"); else seq_puts(seq, ",noinline_dentry"); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + if (test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + else + seq_puts(seq, ",noflush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); else @@ -2052,7 +2060,8 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - set_opt(sbi, FLUSH_MERGE); + if (!(f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb))) + set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) { From 0b6d584ff606c3ff05c35ad8acad20e0e2ae2322 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 11 Nov 2022 09:04:06 -0800 Subject: [PATCH 527/580] f2fs: introduce F2FS_IOC_START_ATOMIC_REPLACE introduce a new ioctl to replace the whole content of a file atomically, which means it induces truncate and content update at the same time. We can start it with F2FS_IOC_START_ATOMIC_REPLACE and complete it with F2FS_IOC_COMMIT_ATOMIC_WRITE. Or abort it with F2FS_IOC_ABORT_ATOMIC_WRITE. 
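A user-space sketch of the intended call sequence follows, assuming a uapi <linux/f2fs.h> that already carries this patch; error handling is trimmed, and the open mode is only what the ioctls minimally need.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/f2fs.h>

int main(int argc, char **argv)
{
	const char msg[] = "new contents\n";
	int fd;

	if (argc != 2)
		return 1;

	fd = open(argv[1], O_WRONLY);
	if (fd < 0)
		return 1;

	/* start the atomic replace: the file is treated as truncated to 0 */
	if (ioctl(fd, F2FS_IOC_START_ATOMIC_REPLACE) < 0) {
		perror("F2FS_IOC_START_ATOMIC_REPLACE");
		return 1;
	}

	if (write(fd, msg, strlen(msg)) != (ssize_t)strlen(msg)) {
		/* abort keeps the old contents intact */
		ioctl(fd, F2FS_IOC_ABORT_ATOMIC_WRITE);
		return 1;
	}

	/* publish truncation and new data in one step */
	return ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE) ? 1 : 0;
}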
Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 21 +++++++++++++++------ fs/f2fs/segment.c | 13 ++++++++++++- include/uapi/linux/f2fs.h | 1 + 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f92e0b974cd7..a7fcb564c29e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3520,6 +3520,9 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, else if (*blk_addr != NULL_ADDR) return 0; + if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) + goto reserve_block; + /* Look for the block in the original inode */ err = __find_data_block(inode, index, &ori_blk_addr); if (err) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d735be33c87c..cf4f1490f74b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -765,6 +765,7 @@ enum { FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ + FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_MAX, /* max flag, never be used */ }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 80690e9ddb2e..4828e4ca0d3f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2068,7 +2068,7 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) return put_user(inode->i_generation, (int __user *)arg); } -static int f2fs_ioc_start_atomic_write(struct file *filp) +static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -2136,15 +2136,22 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_write_inode(inode, NULL); - isize = i_size_read(inode); - fi->original_i_size = isize; - f2fs_i_size_write(fi->cow_inode, isize); - stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + + isize = i_size_read(inode); + fi->original_i_size = isize; + if (truncate) { + set_inode_flag(inode, FI_ATOMIC_REPLACE); + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, 0); + isize = 0; + } + f2fs_i_size_write(fi->cow_inode, isize); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); f2fs_update_time(sbi, REQ_TIME); @@ -4246,7 +4253,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: - return f2fs_ioc_start_atomic_write(filp); + return f2fs_ioc_start_atomic_write(filp, false); + case F2FS_IOC_START_ATOMIC_REPLACE: + return f2fs_ioc_start_atomic_write(filp, true); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_ABORT_ATOMIC_WRITE: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 122fd737a38a..b763e1bd0803 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -196,6 +196,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); @@ -260,14 +261,24 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; + pgoff_t start_index = 0; + bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - if 
(revoke) + if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); + } else if (truncate) { + f2fs_truncate_hole(inode, start_index, cur->index); + start_index = cur->index + 1; + } + list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } + + if (!revoke && truncate) + f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 3121d127d5aa..955d440be104 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -42,6 +42,7 @@ struct f2fs_comp_option) #define F2FS_IOC_DECOMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 23) #define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24) +#define F2FS_IOC_START_ATOMIC_REPLACE _IO(F2FS_IOCTL_MAGIC, 25) /* * should be same as XFS_IOC_GOINGDOWN. From c765c3502c9ba3e1c62cb49e2536155111fb87b0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 15 Nov 2022 00:08:47 +0800 Subject: [PATCH 528/580] f2fs: fix to do sanity check on i_extra_isize in is_alive() syzbot found a f2fs bug: BUG: KASAN: slab-out-of-bounds in data_blkaddr fs/f2fs/f2fs.h:2891 [inline] BUG: KASAN: slab-out-of-bounds in is_alive fs/f2fs/gc.c:1117 [inline] BUG: KASAN: slab-out-of-bounds in gc_data_segment fs/f2fs/gc.c:1520 [inline] BUG: KASAN: slab-out-of-bounds in do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 Read of size 4 at addr ffff888076557568 by task kworker/u4:3/52 CPU: 1 PID: 52 Comm: kworker/u4:3 Not tainted 6.1.0-rc4-syzkaller-00362-gfef7fd48922d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: writeback wb_workfn (flush-7:0) Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x15e/0x45d mm/kasan/report.c:395 kasan_report+0xbb/0x1f0 mm/kasan/report.c:495 data_blkaddr fs/f2fs/f2fs.h:2891 [inline] is_alive fs/f2fs/gc.c:1117 [inline] gc_data_segment fs/f2fs/gc.c:1520 [inline] do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 f2fs_gc+0x88c/0x20a0 fs/f2fs/gc.c:1831 f2fs_balance_fs+0x544/0x6b0 fs/f2fs/segment.c:410 f2fs_write_inode+0x57e/0xe20 fs/f2fs/inode.c:753 write_inode fs/fs-writeback.c:1440 [inline] __writeback_single_inode+0xcfc/0x1440 fs/fs-writeback.c:1652 writeback_sb_inodes+0x54d/0xf90 fs/fs-writeback.c:1870 wb_writeback+0x2c5/0xd70 fs/fs-writeback.c:2044 wb_do_writeback fs/fs-writeback.c:2187 [inline] wb_workfn+0x2dc/0x12f0 fs/fs-writeback.c:2227 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 The root cause is that we forgot to do sanity check on .i_extra_isize in below path, result in accessing invalid address later, fix it. 
- gc_data_segment - is_alive - data_blkaddr - offset_in_addr Reported-by: syzbot+f8f3dfa4abc489e768a1@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-f2fs-devel/0000000000003cb3c405ed5c17f9@google.com/T/#u Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 61447eb3e52e..85f079478be4 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1077,7 +1077,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node, max_addrs; + unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1103,11 +1103,17 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE : - DEF_ADDRS_PER_BLOCK; - if (ofs_in_node >= max_addrs) { - f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", - ofs_in_node, dni->ino, dni->nid, max_addrs); + if (IS_INODE(node_page)) { + base = offset_in_addr(F2FS_INODE(node_page)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } + + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); f2fs_put_page(node_page, 1); return false; } From cd094662a4d5efb7a64ab21541292e69c15d048d Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 12 Nov 2022 00:13:49 +0800 Subject: [PATCH 529/580] f2fs: remove submit label in __submit_discard_cmd() Complaint from Matthew Wilcox in another similar place: "submit? You don't submit anything at the 'submit' label. it should be called 'skip' or something. But I think this is just badly written and you don't need a goto at all." Let's remove submit label for readability. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b763e1bd0803..3b6e0687bef9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1143,13 +1143,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; - goto submit; - } - err = __blkdev_issue_discard(bdev, + } else { + err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, 0, &bio); -submit: + } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) From bb1e68a1a9127d1219ab304bd3c044108ca5d877 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:35 +0800 Subject: [PATCH 530/580] f2fs: fix to alloc_mode changed after remount on a small volume device The commit 84b89e5d943d8 ("f2fs: add auto tuning for small devices") add tuning for small volume device, now support to tune alloce_mode to 'reuse' if it's small size. But the alloc_mode will change to 'default' when do remount on this small size dievce. This patch fo fix alloc_mode changed when do remount for a small volume device. 
Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a8b4763bcee..cb04a291554d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2040,7 +2040,11 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <= + SMALL_VOLUME_SEGMENTS) + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + else + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); @@ -4031,7 +4035,6 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; if (f2fs_block_unit_discard(sbi)) sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | From 7ca58c065b3328f71322e501ece4b5f3a86c8ca3 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:36 +0800 Subject: [PATCH 531/580] f2fs: cleanup for 'f2fs_tuning_parameters' function A cleanup patch for the 'f2fs_tuning_parameters' function. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cb04a291554d..cd005731ead2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4031,13 +4031,11 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_i = SM_I(sbi); - /* adjust parameters according to the volume size */ - if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } From ef8debc915956803963ddc4e79c4c1ae7cb0ed75 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:37 +0800 Subject: [PATCH 532/580] f2fs: change type for 'sbi->readdir_ra' Before this patch, the variable 'readdir_ra' only ever takes effect based on whether or not it equals '1', so we can change its type from 'int' to 'bool'.
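For illustration, a minimal user-space sketch of driving this knob; the "sda1" entry under /sys/fs/f2fs/ is an assumption and should be whatever block device the filesystem is mounted from. Any non-zero value written is normalised to true by the '!!t' in __sbi_store():

#include <stdio.h>

int main(void)
{
	/* assumed path: replace "sda1" with the mounted f2fs device */
	FILE *f = fopen("/sys/fs/f2fs/sda1/readdir_ra", "w");

	if (!f) {
		perror("readdir_ra");
		return 1;
	}
	fputs("1\n", f);	/* any non-zero value enables readahead */
	fclose(f);
	return 0;
}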
Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 5 +++++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 941ba42296db..dd3dc3bbc7c6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1012,7 +1012,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; - bool readdir_ra = sbi->readdir_ra == 1; + bool readdir_ra = sbi->readdir_ra; bool found_valid_dirent = false; int err = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cf4f1490f74b..10fdd7974fba 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1697,7 +1697,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ - int readdir_ra; /* readahead inode in readdir */ + bool readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cd005731ead2..f99dfa1cb02b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4039,7 +4039,7 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) 1 << F2FS_IPU_HONOR_OPU_WRITE; } - sbi->readdir_ra = 1; + sbi->readdir_ra = true; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 9e84c3ce6a27..54e0bb6c07b1 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -657,6 +657,11 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "readdir_ra")) { + sbi->readdir_ra = !!t; + return count; + } + *ui = (unsigned int)t; return count; From 3fda9d562ea17e55acbb2e8a1e220fc0f9af9a4f Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:29 +0800 Subject: [PATCH 533/580] f2fs: fix to enable compress for newly created file if extension matches If compress_extension is set, and a newly created file matches the extension, the file could be marked as compression file. However, if inline_data is also enabled, there is no chance to check its extension since f2fs_should_compress() always returns false. This patch moves set_compress_inode(), which do extension check, in f2fs_should_compress() to check extensions before setting inline data flag. Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/namei.c | 329 ++++++++++++++++++++++++------------------------ 2 files changed, 164 insertions(+), 167 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 10fdd7974fba..1b62ee18c964 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2989,7 +2989,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) + F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 833360f2951d..868637cb6922 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,7 +22,162 @@ #include "acl.h" #include -static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) +{ + size_t slen = strlen(s); + size_t sublen = strlen(sub); + int i; + + if (sublen == 1 && *sub == '*') + return 1; + + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension + (optional: '.' + temp extension)". + */ + if (slen < sublen + 2) + return 0; + + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } + + return 0; +} + +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + memcpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + memcpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; +} + +static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, + struct inode *inode, const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = + F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi)) + return; + + if (S_ISDIR(inode->i_mode)) + goto inherit_comp; + + /* This name comes only from normal files. */ + if (!name) + return; + + /* Don't compress hot files. 
*/ + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = cold_count; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], false)) + break; + f2fs_up_read(&sbi->sb_lock); + if (i < (cold_count + hot_count)) + return; + + /* Don't compress unallowed extension. */ + for (i = 0; i < noext_cnt; i++) + if (is_extension_exist(name, noext[i], false)) + return; + + /* Compress wanting extension. */ + for (i = 0; i < ext_cnt; i++) { + if (is_extension_exist(name, ext[i], false)) { + set_compress_context(inode); + return; + } + } +inherit_comp: + /* Inherit the {no-}compression flag in directory */ + if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) { + F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL; + f2fs_mark_inode_dirty_sync(inode, true); + } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) { + set_compress_context(inode); + } +} + +static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode, + const char *name) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); nid_t ino; @@ -108,12 +263,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); - if (f2fs_sb_has_compression(sbi)) { - /* Inherit the compression flag in directory */ - if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && - f2fs_may_compress(inode)) - set_compress_context(inode); - } + /* Check compression first. */ + set_compress_new_inode(sbi, dir, inode, name); /* Should enable inline_data after compression set */ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -147,40 +298,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) return ERR_PTR(err); } -static inline int is_extension_exist(const unsigned char *s, const char *sub, - bool tmp_ext) -{ - size_t slen = strlen(s); - size_t sublen = strlen(sub); - int i; - - if (sublen == 1 && *sub == '*') - return 1; - - /* - * filename format of multimedia file should be defined as: - * "filename + '.' + extension + (optional: '.' + temp extension)". 
- */ - if (slen < sublen + 2) - return 0; - - if (!tmp_ext) { - /* file has no temp extension */ - if (s[slen - sublen - 1] != '.') - return 0; - return !strncasecmp(s + slen - sublen, sub, sublen); - } - - for (i = 1; i < slen - sublen; i++) { - if (s[i] != '.') - continue; - if (!strncasecmp(s + i + 1, sub, sublen)) - return 1; - } - - return 0; -} - /* * Set file's temperature for hot/cold data separation */ @@ -211,124 +328,6 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * file_set_hot(inode); } -int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, - bool hot, bool set) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int cold_count = le32_to_cpu(sbi->raw_super->extension_count); - int hot_count = sbi->raw_super->hot_ext_count; - int total_count = cold_count + hot_count; - int start, count; - int i; - - if (set) { - if (total_count == F2FS_MAX_EXTENSION) - return -EINVAL; - } else { - if (!hot && !cold_count) - return -EINVAL; - if (hot && !hot_count) - return -EINVAL; - } - - if (hot) { - start = cold_count; - count = total_count; - } else { - start = 0; - count = cold_count; - } - - for (i = start; i < count; i++) { - if (strcmp(name, extlist[i])) - continue; - - if (set) - return -EINVAL; - - memcpy(extlist[i], extlist[i + 1], - F2FS_EXTENSION_LEN * (total_count - i - 1)); - memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); - if (hot) - sbi->raw_super->hot_ext_count = hot_count - 1; - else - sbi->raw_super->extension_count = - cpu_to_le32(cold_count - 1); - return 0; - } - - if (!set) - return -EINVAL; - - if (hot) { - memcpy(extlist[count], name, strlen(name)); - sbi->raw_super->hot_ext_count = hot_count + 1; - } else { - char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; - - memcpy(buf, &extlist[cold_count], - F2FS_EXTENSION_LEN * hot_count); - memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); - memcpy(extlist[cold_count], name, strlen(name)); - memcpy(&extlist[cold_count + 1], buf, - F2FS_EXTENSION_LEN * hot_count); - sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); - } - return 0; -} - -static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; - unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; - unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; - unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; - int i, cold_count, hot_count; - - if (!f2fs_sb_has_compression(sbi) || - F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode) || - (!ext_cnt && !noext_cnt)) - return; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], false)) { - f2fs_up_read(&sbi->sb_lock); - return; - } - } - - f2fs_up_read(&sbi->sb_lock); - - for (i = 0; i < noext_cnt; i++) { - if (is_extension_exist(name, noext[i], false)) { - f2fs_disable_compressed_file(inode); - return; - } - } - - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return; - - for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i], false)) - continue; - - /* Do not use inline_data with compression */ - stat_dec_inline_inode(inode); - clear_inode_flag(inode, 
FI_INLINE_DATA); - set_compress_context(inode); - return; - } -} - static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -346,15 +345,13 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) set_file_temperature(sbi, inode, dentry->d_name.name); - set_compress_inode(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -684,7 +681,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -754,7 +751,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) return err; - inode = f2fs_new_inode(dir, S_IFDIR | mode); + inode = f2fs_new_inode(dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -811,7 +808,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -850,7 +847,7 @@ static int __f2fs_tmpfile(struct inode *dir, if (err) return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); From bdf16ef1baafd64b109185701a7a4d8b5c32918a Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:30 +0800 Subject: [PATCH 534/580] f2fs: move set_file_temperature into f2fs_new_inode Since the file name has already passed to f2fs_new_inode(), let's move set_file_temperature() into f2fs_new_inode(). 
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 62 +++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 868637cb6922..ff4d79110fdc 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -176,6 +176,32 @@ static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, } } +/* + * Set file's temperature for hot/cold data separation + */ +static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = 0; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], true)) + break; + f2fs_up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); +} + static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode, const char *name) { @@ -270,6 +296,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode, if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); + if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, name); + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -298,36 +327,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode, return ERR_PTR(err); } -/* - * Set file's temperature for hot/cold data separation - */ -static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int i, cold_count, hot_count; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], true)) - break; - } - - f2fs_up_read(&sbi->sb_lock); - - if (i == cold_count + hot_count) - return; - - if (i < cold_count) - file_set_cold(inode); - else - file_set_hot(inode); -} - static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -349,9 +348,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_ERR(inode)) return PTR_ERR(inode); - if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_file_temperature(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; From 2b88f2ba3bafba365c4b0f86e779ca3682c26c3a Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 19 Nov 2022 01:40:28 +0800 Subject: [PATCH 535/580] f2fs: fix description about discard_granularity node Let's fix the inconsistency in the text description. Default discard granularity is 16. For small devices, default value is 1. 
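For reference, with the 4KB block unit used by this node, the new default of 16 means 16 x 4KB = 64KB per discard command, while the small-device default of 1 means a single 4KB block.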
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index b73b23f34704..53749c3cea18 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -135,7 +135,8 @@ Contact: "Chao Yu" Description: Controls discard granularity of inner discard thread. Inner thread will not issue discards with size that is smaller than granularity. The unit size is one block(4KB), now only support configuring - in range of [1, 512]. Default value is 4(=16KB). + in range of [1, 512]. Default value is 16. + For small devices, default value is 1. What: /sys/fs/f2fs//umount_discard_timeout Date: January 2019 From 9abc6cdac145e33d4f14b4a567492daf91e201e4 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 17 Nov 2022 01:10:45 +0800 Subject: [PATCH 536/580] f2fs: make __queue_discard_cmd() return void Since __queue_discard_cmd() never returns an error, let's make it return void. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3b6e0687bef9..87f797a855cf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1359,13 +1359,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, } } -static int __queue_discard_cmd(struct f2fs_sb_info *sbi, +static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) - return 0; + return; trace_f2fs_queue_discard(bdev, blkstart, blklen); @@ -1377,7 +1377,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); - return 0; } static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, @@ -1776,7 +1775,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } /* For conventional zones, use regular discard if supported */ - return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + return 0; } #endif @@ -1787,7 +1787,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __queue_discard_cmd(sbi, bdev, blkstart, blklen); + __queue_discard_cmd(sbi, bdev, blkstart, blklen); + return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, From 65277c1f29383d54c5c69dca1ef76a452861d5ba Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Nov 2022 06:42:52 +0800 Subject: [PATCH 537/580] f2fs: truncate blocks in batch in __complete_revoke_list() Use f2fs_do_truncate_blocks() to truncate all blocks in-batch in __complete_revoke_list(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 87f797a855cf..6fae5f3b93d4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -261,24 +261,19 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; - pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - if (revoke) { + if (revoke) __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); - } else if (truncate) { - f2fs_truncate_hole(inode, start_index, cur->index); - start_index = cur->index + 1; - } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) - f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); + f2fs_do_truncate_blocks(inode, 0, false); } static int __f2fs_commit_atomic_write(struct inode *inode) From 15cee4693895e3fc9a0b97947ae9ca12a3f86c48 Mon Sep 17 00:00:00 2001 From: Yonggil Song Date: Tue, 22 Nov 2022 18:03:20 +0900 Subject: [PATCH 538/580] f2fs: avoid victim selection from previous victim section When f2fs chooses GC victim in large section & LFS mode, next_victim_seg[gc_type] is referenced first. After segment is freed, next_victim_seg[gc_type] has the next segment number. However, next_victim_seg[gc_type] still has the last segment number even after the last segment of section is freed. In this case, when f2fs chooses a victim for the next GC round, the last segment of previous victim section is chosen as a victim. Initialize next_victim_seg[gc_type] to NULL_SEGNO for the last segment in large section. Fixes: e3080b0120a1 ("f2fs: support subsectional garbage collection") Signed-off-by: Yonggil Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 85f079478be4..7efaff5bac90 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1748,8 +1748,9 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, get_valid_blocks(sbi, segno, false) == 0) seg_freed++; - if (__is_large_section(sbi) && segno + 1 < end_segno) - sbi->next_victim_seg[gc_type] = segno + 1; + if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO; skip: f2fs_put_page(sum_page, 0); } From 3c437ff575884678525146c29140c64785f1cdb9 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 18 Nov 2022 11:46:00 +0800 Subject: [PATCH 539/580] f2fs: init discard policy after thread wakeup Under the current logic, after the discard thread wakes up, it will not run according to the expected policy, but will use the expected policy before sleep. Move the strategy selection to after the thread wakes up, so that the running state of the thread meets expectations. 
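For example, if gc_mode is switched to GC_URGENT_HIGH (or dcc->discard_wake is raised) while the thread is asleep, the old ordering would still issue this round with the background policy chosen before the sleep; selecting the policy right after wait_event_interruptible_timeout() returns lets the DPOLICY_FORCE behaviour apply in the same iteration.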
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6fae5f3b93d4..f9d8f2cbfba0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1680,6 +1680,11 @@ static int issue_discard_thread(void *data) set_freezable(); do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); @@ -1687,14 +1692,6 @@ static int issue_discard_thread(void *data) __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); - if (!atomic_read(&dcc->discard_cmd_cnt)) - wait_ms = dpolicy.max_interval; - - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, - msecs_to_jiffies(wait_ms)); - if (dcc->discard_wake) dcc->discard_wake = 0; @@ -1708,12 +1705,11 @@ static int issue_discard_thread(void *data) continue; if (kthread_should_stop()) return 0; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || + !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } - if (!atomic_read(&dcc->discard_cmd_cnt)) - continue; sb_start_intwrite(sbi->sb); @@ -1728,6 +1724,8 @@ static int issue_discard_thread(void *data) } else { wait_ms = dpolicy.max_interval; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); From 89fd0db61192e63c6b893d80dd06908330afc784 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:01 +0800 Subject: [PATCH 540/580] f2fs: define MIN_DISCARD_GRANULARITY macro Do cleanup in f2fs_tuning_parameters() and __init_discard_policy(), let's use macro instead of number. 
Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1b62ee18c964..cbccd25626d3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -325,6 +325,8 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* minimum discard granularity, unit: block count */ +#define MIN_DISCARD_GRANULARITY 1 /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 /* default maximum discard granularity of ordered discard, unit: block count */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f9d8f2cbfba0..afd759d32c41 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1064,7 +1064,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; @@ -1079,7 +1079,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f99dfa1cb02b..b4dbe0c4f483 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4034,7 +4034,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->dcc_info->discard_granularity = + MIN_DISCARD_GRANULARITY; SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } From d2d99c6536c966fec74938738a2f96a5004af9b6 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:02 +0800 Subject: [PATCH 541/580] f2fs: introduce discard_urgent_util sysfs node Through this node, you can control the background discard to run more aggressively or not aggressively when reach the utilization rate of the space. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 9 +++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 53749c3cea18..fc0704ff13bd 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -622,3 +622,11 @@ Date: October 2022 Contact: "Yangtao Li" Description: Show the current gc_mode as a string. This is a read-only entry. + +What: /sys/fs/f2fs//discard_urgent_util +Date: November 2022 +Contact: "Yangtao Li" +Description: When space utilization exceeds this, do background DISCARD aggressively. + Does DISCARD forcibly in a period of given min_discard_issue_time when the number + of discards is not 0 and set discard granularity to 1. 
+ Default: 80 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cbccd25626d3..0bcefba8bd2a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -409,6 +409,7 @@ struct discard_cmd_control { unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. interval between discard issue */ + unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index afd759d32c41..498301e9147f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1063,7 +1063,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; - if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + if (utilization(sbi) > dcc->discard_urgent_util) { dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = @@ -2079,6 +2079,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 54e0bb6c07b1..8d16f649f2be 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -502,6 +502,13 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "discard_urgent_util")) { + if (t > 100) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -801,6 +808,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -928,6 +936,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_discard_issue_time), ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), From d3810b059bf2826b10a093c69b4afd4e0662544a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:44 +0100 Subject: [PATCH 542/580] f2fs: remove struct segment_allocation default_salloc_ops There is only single instance of these ops, so remove the indirection and call allocate_segment_by_default directly. 
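To show the pattern in isolation (toy code, not f2fs code; the names below are made up): an ops table with exactly one implementation only adds an indirect call that a direct call avoids and that the compiler can more easily inline.

#include <stdbool.h>
#include <stdio.h>

struct alloc_ops {
	void (*allocate)(int type, bool force);
};

static void allocate_by_default(int type, bool force)
{
	printf("allocate: type=%d force=%d\n", type, force);
}

static const struct alloc_ops default_ops = {
	.allocate = allocate_by_default,
};

int main(void)
{
	default_ops.allocate(0, true);	/* before: indirect call, single target */
	allocate_by_default(0, true);	/* after: direct call */
	return 0;
}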
Signed-off-by: Christoph Hellwig Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++--------- fs/f2fs/segment.h | 6 ------ 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 498301e9147f..673bc16e0840 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2926,7 +2926,7 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + allocate_segment_by_default(sbi, type, true); locate_dirty_segment(sbi, old_segno); } @@ -2957,10 +2957,6 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) f2fs_up_read(&SM_I(sbi)->curseg_lock); } -static const struct segment_allocation default_salloc_ops = { - .allocate_segment = allocate_segment_by_default, -}; - bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { @@ -3284,7 +3280,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); else - sit_i->s_ops->allocate_segment(sbi, type, false); + allocate_segment_by_default(sbi, type, false); } /* * segment dirty status should be updated after segment allocation, @@ -4269,9 +4265,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - /* init SIT information */ - sit_i->s_ops = &default_salloc_ops; - sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ace02b153038..c2fcc6a8ef06 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -222,10 +222,6 @@ struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; -struct segment_allocation { - void (*allocate_segment)(struct f2fs_sb_info *, int, bool); -}; - #define MAX_SKIP_GC_COUNT 16 struct revoke_entry { @@ -235,8 +231,6 @@ struct revoke_entry { }; struct sit_info { - const struct segment_allocation *s_ops; - block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ From f79e85d6aa73030f60bf37f8bb7f544129085734 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:45 +0100 Subject: [PATCH 543/580] f2fs: open code allocate_segment_by_default allocate_segment_by_default has just two callers, which use very different code paths inside it based on the force parameter. Just open code the logic in the two callers using a new helper to decide if a new segment should be allocated.
Signed-off-by: Christoph Hellwig Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 50 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 673bc16e0840..1d5c23df65e4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2849,31 +2849,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, return 0; } -/* - * flush out current segment and replace it with new segment - * This function should be returned with success, otherwise BUG - */ -static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) +static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) - new_curseg(sbi, type, true); - else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - curseg->seg_type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && - is_next_segment_free(sbi, curseg, type) && - likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && - get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); - else - new_curseg(sbi, type, false); - - stat_inc_seg_type(sbi, curseg); + if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + return true; + if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) + return true; + return false; } void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, @@ -2926,7 +2915,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - allocate_segment_by_default(sbi, type, true); + new_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); } @@ -3276,11 +3266,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, update_sit_entry(sbi, old_blkaddr, -1); if (!__has_curseg_space(sbi, curseg)) { - if (from_gc) + /* + * Flush out current segment and replace it with new segment. 
+ */ + if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); - else - allocate_segment_by_default(sbi, type, false); + } else { + if (need_new_seg(sbi, type)) + new_curseg(sbi, type, false); + else + change_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); + } } /* * segment dirty status should be updated after segment allocation, From bdc0ba2359e726def4cb64929139a6427e5ef6bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:46 +0100 Subject: [PATCH 544/580] f2fs: remove the unused flush argument to change_curseg Signed-off-by: Christoph Hellwig Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1d5c23df65e4..618930fd0a97 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2656,7 +2656,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2664,9 +2664,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) struct f2fs_summary_block *sum_node; struct page *sum_page; - if (flush) - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); @@ -2705,7 +2703,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; @@ -2880,7 +2878,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, true); @@ -3276,7 +3274,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (need_new_seg(sbi, type)) new_curseg(sbi, type, false); else - change_curseg(sbi, type, true); + change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } } @@ -3538,7 +3536,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -3566,7 +3564,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; From 38b6184e495413afb4472840ea8e5c996aa05b66 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Nov 2022 10:38:07 +0800 Subject: [PATCH 545/580] MAINTAINERS: Add f2fs bug tracker link As f2fs component in bugzilla.kernel.org was created and used since 2018-7. 
Signed-off-by: Chao Yu Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 6 +++++- MAINTAINERS | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 27cca9e4c3df..6fd72befee71 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -25,10 +25,14 @@ a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs). - git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git -For reporting bugs and sending patches, please use the following mailing list: +For sending patches, please use the following mailing list: - linux-f2fs-devel@lists.sourceforge.net +For reporting bugs, please use the following f2fs bug tracker link: + +- https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs + Background and Design issues ============================ diff --git a/MAINTAINERS b/MAINTAINERS index 1f011aac6d2b..7fc1386781da 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5588,6 +5588,7 @@ M: Jaegeuk Kim M: Chao Yu L: linux-f2fs-devel@lists.sourceforge.net W: https://f2fs.wiki.kernel.org/ +B: https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs T: git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git S: Maintained F: Documentation/filesystems/f2fs.txt From de60d859b98fba9c8a37647743f0ddfdce315a37 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 25 Nov 2022 19:47:36 +0800 Subject: [PATCH 546/580] f2fs: do some cleanup for f2fs module init Just for cleanup, no functional changes. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 46 ++++++---------------------------------------- fs/f2fs/data.c | 14 ++++---------- fs/f2fs/gc.c | 4 +--- fs/f2fs/recovery.c | 4 +--- fs/f2fs/super.c | 8 ++------ 5 files changed, 14 insertions(+), 62 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4b0b655d8487..14d9f9292e90 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -537,10 +537,7 @@ MODULE_PARM_DESC(num_compress_pages, int f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); - if (!compress_page_pool) - return -ENOMEM; - - return 0; + return compress_page_pool ? 0 : -ENOMEM; } void f2fs_destroy_compress_mempool(void) @@ -1958,9 +1955,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, sbi->page_array_slab_size); - if (!sbi->page_array_slab) - return -ENOMEM; - return 0; + return sbi->page_array_slab ? 
0 : -ENOMEM; } void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) @@ -1968,53 +1963,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) kmem_cache_destroy(sbi->page_array_slab); } -static int __init f2fs_init_cic_cache(void) +int __init f2fs_init_compress_cache(void) { cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", sizeof(struct compress_io_ctx)); if (!cic_entry_slab) return -ENOMEM; - return 0; -} - -static void f2fs_destroy_cic_cache(void) -{ - kmem_cache_destroy(cic_entry_slab); -} - -static int __init f2fs_init_dic_cache(void) -{ dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", sizeof(struct decompress_io_ctx)); if (!dic_entry_slab) - return -ENOMEM; - return 0; -} - -static void f2fs_destroy_dic_cache(void) -{ - kmem_cache_destroy(dic_entry_slab); -} - -int __init f2fs_init_compress_cache(void) -{ - int err; - - err = f2fs_init_cic_cache(); - if (err) - goto out; - err = f2fs_init_dic_cache(); - if (err) goto free_cic; return 0; free_cic: - f2fs_destroy_cic_cache(); -out: + kmem_cache_destroy(cic_entry_slab); return -ENOMEM; } void f2fs_destroy_compress_cache(void) { - f2fs_destroy_dic_cache(); - f2fs_destroy_cic_cache(); + kmem_cache_destroy(dic_entry_slab); + kmem_cache_destroy(cic_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a7fcb564c29e..9abf471a03f9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -37,10 +37,8 @@ static struct bio_set f2fs_bioset; int __init f2fs_init_bioset(void) { - if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, - 0, BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; + return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) @@ -4326,9 +4324,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); - if (!sbi->post_read_wq) - return -ENOMEM; - return 0; + return sbi->post_read_wq ? 0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) @@ -4341,9 +4337,7 @@ int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); - if (!bio_entry_slab) - return -ENOMEM; - return 0; + return bio_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7efaff5bac90..ca57bd4fb784 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1903,9 +1903,7 @@ int __init f2fs_create_garbage_collection_cache(void) { victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", sizeof(struct victim_entry)); - if (!victim_entry_slab) - return -ENOMEM; - return 0; + return victim_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_garbage_collection_cache(void) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 5991391733b4..dd655883d81c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -907,9 +907,7 @@ int __init f2fs_create_recovery_cache(void) { fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; - return 0; + return fsync_entry_slab ? 
0 : -ENOMEM; } void f2fs_destroy_recovery_cache(void) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b4dbe0c4f483..dae12f6fe8e3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -291,9 +291,7 @@ static int __init f2fs_create_casefold_cache(void) { f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", F2FS_NAME_LEN); - if (!f2fs_cf_name_slab) - return -ENOMEM; - return 0; + return f2fs_cf_name_slab ? 0 : -ENOMEM; } static void f2fs_destroy_casefold_cache(void) @@ -4590,9 +4588,7 @@ static int __init init_inodecache(void) f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); - if (!f2fs_inode_cachep) - return -ENOMEM; - return 0; + return f2fs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) From 32bc64aaa35ec48f7b0f6fa5a7c85aa8ef49e061 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 11:37:08 +0800 Subject: [PATCH 547/580] f2fs: remove F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() macro F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() have never been used since they were introduced by this commit 76f105a2dbcd("f2fs: add feature facility in superblock"). So let's remove them. BTW, convert f2fs_sb_has_##name to return bool. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0bcefba8bd2a..16533f3b0a86 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -199,10 +199,6 @@ struct f2fs_mount_info { #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) -#define F2FS_SET_FEATURE(sbi, mask) \ - (sbi->raw_super->feature |= cpu_to_le32(mask)) -#define F2FS_CLEAR_FEATURE(sbi, mask) \ - (sbi->raw_super->feature &= ~cpu_to_le32(mask)) /* * Default values for user and/or group using reserved blocks @@ -4428,7 +4424,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) } #define F2FS_FEATURE_FUNCS(name, flagname) \ -static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ +static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } From 330e9aff0c9328ef896c899a90d1240b8e4712b2 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 10:48:42 +0800 Subject: [PATCH 548/580] f2fs: introduce f2fs_is_readonly() for readability Introduce f2fs_is_readonly() and use it to simplify code. 
Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/super.c | 5 ++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 16533f3b0a86..7b0cbf2f774f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4674,6 +4674,11 @@ static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, } } +static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dae12f6fe8e3..801de00b3975 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1327,8 +1327,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } - if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) - && test_opt(sbi, FLUSH_MERGE)) { + if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); return -EINVAL; } @@ -2062,7 +2061,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - if (!(f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb))) + if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); From 33654719732ea5e8d5cc9373c8773e687796ada0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:36:43 -0800 Subject: [PATCH 549/580] f2fs: specify extent cache for read explicitly Let's describe it as the read extent cache. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 4 ++-- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/node.h | 2 +- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 12 ++++++------ 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 932c070173b9..8cd87aee0292 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -383,7 +383,7 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (!i_ext || !i_ext->len) return; - get_extent_info(&ei, i_ext); + get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) @@ -710,7 +710,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, EXTENT_CACHE)) + if (!test_opt(sbi, READ_EXTENT_CACHE)) return 0; if (!atomic_read(&sbi->total_zombie_tree)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7b0cbf2f774f..7fa6c3936232 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -88,7 +88,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_USRQUOTA 0x00080000 @@ -596,7 +596,7 @@ enum { #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ -#define EXTENT_CACHE_SHRINK_NUMBER 128 +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 #define RECOVERY_MAX_RA_BLOCKS BIO_MAX_PAGES #define RECOVERY_MIN_RA_BLOCKS 1
@@ -828,7 +828,7 @@ struct f2fs_inode_info { loff_t original_i_size; /* original i_size before atomic write */ }; -static inline void get_extent_info(struct extent_info *ext, +static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); @@ -836,7 +836,7 @@ static inline void get_extent_info(struct extent_info *ext, ext->len = le32_to_cpu(i_ext->len); } -static inline void set_raw_extent(struct extent_info *ext, +static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); @@ -4448,7 +4448,7 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if (!test_opt(sbi, EXTENT_CACHE) || + if (!test_opt(sbi, READ_EXTENT_CACHE) || is_inode_flag_set(inode, FI_NO_EXTENT) || (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && !f2fs_sb_has_readonly(sbi))) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 50220a93a5a2..4207d4c2081a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -629,7 +629,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (et) { read_lock(&et->lock); - set_raw_extent(&et->largest, &ri->i_ext); + set_raw_read_extent(&et->largest, &ri->i_ext); read_unlock(&et->lock); } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a1984bf22177..4f60b19e5fb2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -85,7 +85,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == EXTENT_CACHE) { + } else if (type == READ_EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3c09cae058b0..0aa48704c77a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -146,7 +146,7 @@ enum mem_type { NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ - EXTENT_CACHE, /* indicates extent cache */ + READ_EXTENT_CACHE, /* indicates read extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 618930fd0a97..1ea2c253aae7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -448,8 +448,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) return; /* try to shrink extent cache when there is no enough memory */ - if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 801de00b3975..46a80121fe0b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -805,10 +805,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, FASTBOOT); break; case Opt_extent_cache: - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - clear_opt(sbi, EXTENT_CACHE); + clear_opt(sbi, READ_EXTENT_CACHE); break; case 
Opt_noinline_data: clear_opt(sbi, INLINE_DATA); @@ -1938,7 +1938,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); - if (test_opt(sbi, EXTENT_CACHE)) + if (test_opt(sbi, READ_EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); @@ -2055,7 +2055,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); @@ -2197,7 +2197,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; - bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2287,7 +2287,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } /* disallow enable/disable extent_cache dynamically */ - if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; From 8715442806c2a2decdd69d5933aaab627cbcc051 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:44:58 -0800 Subject: [PATCH 550/580] f2fs: move internal functions into extent_cache.c No functional change. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 88 +++++++++++++++++++++++++++++++++++++----- fs/f2fs/f2fs.h | 69 +-------------------------------- 2 files changed, 81 insertions(+), 76 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 8cd87aee0292..2a8e31e6d518 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -15,6 +15,77 @@ #include "node.h" #include +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen) +{ + ei->fofs = fofs; + ei->blk = blk; + ei->len = len; + + if (keep_clen) + return; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif +} + +static bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. 
+ */ + if (list_empty(&sbi->s_list)) + return false; + + if (!test_opt(sbi, READ_EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT) || + (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi))) + return false; + + return S_ISREG(inode->i_mode); +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); +} + +static bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back) +{ + return __is_extent_mergeable(back, cur); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front) +{ + return __is_extent_mergeable(cur, front); +} + static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, unsigned int ofs) { @@ -591,16 +662,16 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { if (parts) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - org_end - end); + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true); next_en = en; } parts++; @@ -632,8 +703,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, /* 3. 
update extent in extent cache */ if (blkaddr) { - - set_extent_info(&ei, fofs, blkaddr, len); + __set_extent_info(&ei, fofs, len, blkaddr, false); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -692,7 +762,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - set_extent_info(&ei, fofs, blkaddr, llen); + __set_extent_info(&ei, fofs, llen, blkaddr, true); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7fa6c3936232..7032c38f736e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -617,7 +617,7 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - u32 blk; /* start block address of the extent */ + block_t blk; /* start block address of the extent */ #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned int c_len; /* physical extent length of compressed blocks */ #endif @@ -844,17 +844,6 @@ static inline void set_raw_read_extent(struct extent_info *ext, i_ext->len = cpu_to_le32(ext->len); } -static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, - u32 blk, unsigned int len) -{ - ei->fofs = fofs; - ei->blk = blk; - ei->len = len; -#ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; -#endif -} - static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { @@ -874,41 +863,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur, return __is_discard_mergeable(cur, front, max_len); } -static inline bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) -{ -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && front->len != front->c_len) - return false; -#endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); -} - -static inline bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) -{ - return __is_extent_mergeable(back, cur); -} - -static inline bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) -{ - return __is_extent_mergeable(cur, front); -} - -extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) -{ - if (en->ei.len > et->largest.len) { - et->largest = en->ei; - et->largest_updated = true; - } -} - /* * For free nid management */ @@ -2586,6 +2540,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { @@ -4444,26 +4399,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. 
- */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) From de9b22351dd379f7b364f6dde35368c37ecb2782 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 10:01:18 -0800 Subject: [PATCH 551/580] f2fs: remove unnecessary __init_extent_tree Added into the caller. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2a8e31e6d518..c6810347e205 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -386,21 +386,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei) -{ - struct rb_node **p = &et->root.rb_root.rb_node; - struct extent_node *en; - - en = __attach_extent_node(sbi, et, ei, NULL, p, true); - if (!en) - return NULL; - - et->largest = en->ei; - et->cached_en = en; - return en; -} - static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et) { @@ -460,8 +445,12 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (atomic_read(&et->node_cnt)) goto out; - en = __init_extent_tree(sbi, et, &ei); + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); if (en) { + et->largest = en->ei; + et->cached_en = en; + spin_lock(&sbi->extent_lock); list_add_tail(&en->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); From 484b267ce4724e93ac464265453024fc40ac893e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:26:29 -0800 Subject: [PATCH 552/580] f2fs: refactor extent_cache to support for read and more This patch prepares extent_cache to be ready for addition. 
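Concretely, the bookkeeping that used to live directly in struct f2fs_sb_info moves into a per-type struct extent_tree_info, and both the superblock info and the inode info keep an array of caches indexed by a new enum extent_type. A minimal sketch of that layout follows; it is not code from the patch, and kernel types (radix tree root, locks, atomic counters) are reduced to plain placeholders so only the indexing scheme is visible, while the type and field names match the diff below.

	enum extent_type {
		EX_READ,		/* the existing read extent cache */
		NR_EXTENT_CACHES,	/* array size; new cache types append here */
	};

	/* per-type bookkeeping, one instance per cache type */
	struct extent_tree_info {
		void *extent_tree_root;	/* radix tree: ino -> struct extent_tree */
		void *extent_list;	/* LRU list walked by the shrinker */
		long  total_ext_tree;	/* atomic_t counters in the real code */
		long  total_ext_node;
	};

	/* placeholder for struct f2fs_sb_info: one slot per cache type */
	struct f2fs_sb_info_sketch {
		struct extent_tree_info extent_tree[NR_EXTENT_CACHES];
	};

	/* internal helpers now take the type and index into the array */
	static inline struct extent_tree_info *
	sketch_eti(struct f2fs_sb_info_sketch *sbi, enum extent_type type)
	{
		return &sbi->extent_tree[type];
	}

The public entry points keep type-specific names (f2fs_lookup_read_extent_cache(), f2fs_update_read_extent_cache_range(), f2fs_shrink_read_extent_tree()), while the shared helpers (__lookup_extent_tree(), __update_extent_tree_range(), __shrink_extent_tree()) take an enum extent_type argument, so a later cache type can reuse the tree management without duplicating it.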
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +- fs/f2fs/debug.c | 65 +++-- fs/f2fs/extent_cache.c | 465 +++++++++++++++++++++--------------- fs/f2fs/f2fs.h | 119 +++++---- fs/f2fs/file.c | 8 +- fs/f2fs/gc.c | 4 +- fs/f2fs/inode.c | 6 +- fs/f2fs/node.c | 8 +- fs/f2fs/segment.c | 3 +- fs/f2fs/shrinker.c | 19 +- include/trace/events/f2fs.h | 62 +++-- 11 files changed, 471 insertions(+), 308 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9abf471a03f9..afb9a9e546a1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1088,7 +1088,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; f2fs_set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); + f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ @@ -1157,7 +1157,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) struct extent_info ei = {0, }; struct inode *inode = dn->inode; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn->data_blkaddr = ei.blk + index - ei.fofs; return 0; } @@ -1179,7 +1179,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { @@ -1452,7 +1452,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1662,7 +1662,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1707,7 +1707,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -2222,7 +2222,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; - if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) @@ -2687,7 +2687,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && - f2fs_lookup_extent_cache(inode, page->index, &ei)) { + f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, @@ -3413,7 +3413,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, } else if (locked) { err = f2fs_get_block(&dn, index); } else { - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ @@ -3454,7 +3454,7 @@ static int 
__find_data_block(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, ipage, ipage, 0); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a216dcdf6941..a9baa121d829 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -72,15 +72,23 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); - /* validation check of the segment numbers */ + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ si->hit_largest = atomic64_read(&sbi->read_hit_largest); - si->hit_cached = atomic64_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); - si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = atomic_read(&sbi->total_ext_tree); - si->zombie_tree = atomic_read(&sbi->total_zombie_tree); - si->ext_node = atomic_read(&sbi->total_ext_node); + si->hit_total[EX_READ] += si->hit_largest; + + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); @@ -294,10 +302,16 @@ static void update_mem_info(struct f2fs_sb_info *sbi) sizeof(struct nat_entry_set); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += atomic_read(&sbi->total_ext_tree) * + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree); - si->cache_mem += atomic_read(&sbi->total_ext_node) * + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } si->page_mem = 0; if (sbi->node_inode) { @@ -490,16 +504,18 @@ static int stat_show(struct seq_file *s, void *v) si->bg_node_blks); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); - seq_puts(s, "\nExtent Cache:\n"); + seq_puts(s, "\nExtent Cache (Read):\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", - si->hit_largest, si->hit_cached, - si->hit_rbtree); + si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", - !si->total_ext ? 0 : - div64_u64(si->hit_total * 100, si->total_ext), - si->hit_total, si->total_ext); + !si->total_ext[EX_READ] ? 
0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", - si->ext_tree, si->zombie_tree, si->ext_node); + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -566,8 +582,10 @@ static int stat_show(struct seq_file *s, void *v) (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %llu KB\n", + seq_printf(s, " - cached all: %llu KB\n", si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } @@ -600,10 +618,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic64_set(&sbi->total_hit_ext, 0); - atomic64_set(&sbi->read_hit_rbtree, 0); + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ atomic64_set(&sbi->read_hit_largest, 0); - atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6810347e205..654a14ab8977 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -17,21 +17,37 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, - block_t blk, bool keep_clen) + block_t blk, bool keep_clen, + enum extent_type type) { ei->fofs = fofs; - ei->blk = blk; ei->len = len; - if (keep_clen) - return; - + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; #ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; + ei->c_len = 0; #endif + } } -static bool f2fs_may_extent_tree(struct inode *inode) +static bool __may_read_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return false; + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi)) + return false; + return S_ISREG(inode->i_mode); +} + +static bool __may_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -42,18 +58,16 @@ static bool f2fs_may_extent_tree(struct inode *inode) if (list_empty(&sbi->s_list)) return false; - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - return S_ISREG(inode->i_mode); + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; } static void __try_update_largest_extent(struct extent_tree *et, struct extent_node *en) { + if (et->type != EX_READ) + return; if (en->ei.len <= et->largest.len) return; @@ -62,28 +76,31 @@ static void __try_update_largest_extent(struct extent_tree *et, } static bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { + if (type == EX_READ) { #ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && 
front->len != front->c_len) - return false; + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; #endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } + return false; } static bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) + struct extent_info *back, enum extent_type type) { - return __is_extent_mergeable(back, cur); + return __is_extent_mergeable(back, cur, type); } static bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { - return __is_extent_mergeable(cur, front); + return __is_extent_mergeable(cur, front, type); } static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, @@ -308,6 +325,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct rb_node *parent, struct rb_node **p, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en; en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); @@ -321,16 +339,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); - atomic_inc(&sbi->total_ext_node); + atomic_inc(&eti->total_ext_node); return en; } static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); - atomic_dec(&sbi->total_ext_node); + atomic_dec(&eti->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; @@ -346,42 +366,47 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, static void __release_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - spin_lock(&sbi->extent_lock); + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); f2fs_bug_on(sbi, list_empty(&en->list)); list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); } -static struct extent_tree *__grab_extent_tree(struct inode *inode) +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et; nid_t ino = inode->i_ino; - mutex_lock(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS, true, NULL); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; + et->type = type; et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); atomic_set(&et->node_cnt, 0); - atomic_inc(&sbi->total_ext_tree); + atomic_inc(&eti->total_ext_tree); } else { - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_zombie_tree); list_del_init(&et->list); } - mutex_unlock(&sbi->extent_tree_lock); + 
mutex_unlock(&eti->extent_tree_lock); /* never died until evict_inode */ - F2FS_I(inode)->extent_tree = et; + F2FS_I(inode)->extent_tree[type] = et; return et; } @@ -415,35 +440,38 @@ static void __drop_largest_extent(struct extent_tree *et, } /* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) { - /* drop largest extent */ - if (i_ext && i_ext->len) { + if (!__may_extent_tree(inode, type)) { + /* drop largest read extent */ + if (type == EX_READ && i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); - return; } - return; + goto out; } - et = __grab_extent_tree(inode); + et = __grab_extent_tree(inode, type); if (!i_ext || !i_ext->len) - return; + goto out; + + BUG_ON(type != EX_READ); get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) - goto out; + goto unlock_out; en = __attach_extent_node(sbi, et, &ei, NULL, &et->root.rb_root.rb_node, true); @@ -451,37 +479,40 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) et->largest = en->ei; et->cached_en = en; - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); } -out: +unlock_out: write_unlock(&et->lock); +out: + if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + set_inode_flag(inode, FI_NO_EXTENT); } void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) { - __f2fs_init_extent_tree(inode, ipage); - - if (!F2FS_I(inode)->extent_tree) - set_inode_flag(inode, FI_NO_EXTENT); + /* initialize read cache */ + __f2fs_init_extent_tree(inode, ipage, EX_READ); } -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en; bool ret = false; f2fs_bug_on(sbi, !et); - trace_f2fs_lookup_extent_tree_start(inode, pgofs); + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); read_lock(&et->lock); - if (et->largest.fofs <= pgofs && + if (type == EX_READ && + et->largest.fofs <= pgofs && et->largest.fofs + et->largest.len > pgofs) { *ei = et->largest; ret = true; @@ -495,23 +526,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; if (en == et->cached_en) - stat_inc_cached_node_hit(sbi); + stat_inc_cached_node_hit(sbi, type); else - stat_inc_rbtree_node_hit(sbi); + stat_inc_rbtree_node_hit(sbi, type); *ei = en->ei; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + 
spin_unlock(&eti->extent_lock); ret = true; out: - stat_inc_total_hit(sbi); + stat_inc_total_hit(sbi, type); read_unlock(&et->lock); - trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + if (type == EX_READ) + trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); return ret; } @@ -520,18 +552,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en = NULL; - if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { prev_ex->ei.len += ei->len; ei = &prev_ex->ei; en = prev_ex; } - if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { next_ex->ei.fofs = ei->fofs; - next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; if (en) __release_extent_node(sbi, et, prev_ex); @@ -543,12 +577,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, __try_update_largest_extent(et, en); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } @@ -558,6 +592,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -580,48 +615,51 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, __try_update_largest_extent(et, en); /* update in global extent list */ - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); et->cached_en = en; - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } -static void f2fs_update_extent_tree_range(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int len) +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; - unsigned int pos = (unsigned int)fofs; bool updated = false; bool leftmost = false; if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); - + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); write_lock(&et->lock); - if (is_inode_flag_set(inode, FI_NO_EXTENT)) { - write_unlock(&et->lock); - return; + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } + + prev = et->largest; + dei.len = 0; + + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); } - prev = et->largest; - 
dei.len = 0; - - /* - * drop largest extent before lookup, in case it's already - * been shrunk from extent tree - */ - __drop_largest_extent(et, fofs, len); - /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, (struct rb_entry *)et->cached_en, fofs, @@ -641,26 +679,30 @@ static void f2fs_update_extent_tree_range(struct inode *inode, dei = en->ei; org_end = dei.fofs + dei.len; - f2fs_bug_on(sbi, pos >= org_end); + f2fs_bug_on(sbi, fofs >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - en->ei.len = pos - en->ei.fofs; + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; prev_en = en; parts = 1; } - if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (end < org_end && (type != EX_READ || + org_end - end >= F2FS_MIN_EXTENT_LEN)) { if (parts) { __set_extent_info(&ei, end, org_end - end, - end - dei.fofs + dei.blk, false); + end - dei.fofs + dei.blk, false, + type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), - en->ei.blk + (end - dei.fofs), true); + en->ei.blk + (end - dei.fofs), true, + type); next_en = en; } parts++; @@ -690,9 +732,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode, en = next_en; } - /* 3. update extent in extent cache */ - if (blkaddr) { - __set_extent_info(&ei, fofs, len, blkaddr, false); + /* 3. update extent in read extent cache */ + BUG_ON(type != EX_READ); + + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -722,19 +766,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } #ifdef CONFIG_F2FS_FS_COMPRESSION -void f2fs_update_extent_tree_range_compressed(struct inode *inode, +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; struct extent_node *en = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei; struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); + trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -751,7 +796,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true); + __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -762,24 +807,43 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, } #endif -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { + struct extent_info ei; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type 
== EX_READ) { + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } + __update_extent_tree_range(dn->inode, &ei, type); +} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et, *next; struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, READ_EXTENT_CACHE)) - return 0; - - if (!atomic_read(&sbi->total_zombie_tree)) + if (!atomic_read(&eti->total_zombie_tree)) goto free_node; - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et); @@ -787,61 +851,100 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); list_del_init(&et->list); - radix_tree_delete(&sbi->extent_tree_root, et->ino); + radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); tree_cnt++; if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; cond_resched(); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); for (; remained > 0; remained--) { - if (list_empty(&sbi->extent_list)) + if (list_empty(&eti->extent_list)) break; - en = list_first_entry(&sbi->extent_list, + en = list_first_entry(&eti->extent_list, struct extent_node, list); et = en->et; if (!write_trylock(&et->lock)) { /* refresh this extent node's position in extent list */ - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); continue; } list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); write_unlock(&et->lock); node_cnt++; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); unlock_out: - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); out: - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); return node_cnt + tree_cnt; } -unsigned int f2fs_destroy_extent_node(struct inode *inode) +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + 
return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et || !atomic_read(&et->node_cnt)) @@ -854,31 +957,44 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } -void f2fs_drop_extent_tree(struct inode *inode) +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; - if (!f2fs_may_extent_tree(inode)) + if (!__may_extent_tree(inode, type)) return; write_lock(&et->lock); - set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); - if (et->largest.len) { - et->largest.len = 0; - updated = true; + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } } write_unlock(&et->lock); if (updated) f2fs_mark_inode_dirty_sync(inode, true); } -void f2fs_destroy_extent_tree(struct inode *inode) +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et) @@ -886,76 +1002,49 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - mutex_lock(&sbi->extent_tree_lock); - list_add_tail(&et->list, &sbi->zombie_list); - atomic_inc(&sbi->total_zombie_tree); - mutex_unlock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); return; } /* free all extent info belong to this extent tree */ - node_cnt = f2fs_destroy_extent_node(inode); + node_cnt = __destroy_extent_node(inode, type); /* delete extent tree entry in radix tree */ - mutex_lock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - mutex_unlock(&sbi->extent_tree_lock); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); - F2FS_I(inode)->extent_tree = NULL; + F2FS_I(inode)->extent_tree[type] = NULL; - trace_f2fs_destroy_extent_tree(inode, node_cnt); + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); } -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +void f2fs_destroy_extent_tree(struct inode *inode) { - if (!f2fs_may_extent_tree(inode)) - return false; - - 
return f2fs_lookup_extent_tree(inode, pgofs, ei); + __destroy_extent_tree(inode, EX_READ); } -void f2fs_update_extent_cache(struct dnode_of_data *dn) +static void __init_extent_tree_info(struct extent_tree_info *eti) { - pgoff_t fofs; - block_t blkaddr; - - if (!f2fs_may_extent_tree(dn->inode)) - return; - - if (dn->data_blkaddr == NEW_ADDR) - blkaddr = NULL_ADDR; - else - blkaddr = dn->data_blkaddr; - - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); -} - -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len) - -{ - if (!f2fs_may_extent_tree(dn->inode)) - return; - - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); } void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - mutex_init(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - atomic_set(&sbi->total_ext_tree, 0); - INIT_LIST_HEAD(&sbi->zombie_list); - atomic_set(&sbi->total_zombie_tree, 0); - atomic_set(&sbi->total_ext_node, 0); + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7032c38f736e..2ac0cc0f108e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -592,16 +592,22 @@ enum { /* dirty segments threshold for triggering CP */ #define DEFAULT_DIRTY_THRESHOLD 4 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_PAGES +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 -#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_PAGES -#define RECOVERY_MIN_RA_BLOCKS 1 - -#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ +/* extent cache type */ +enum extent_type { + EX_READ, + NR_EXTENT_CACHES, +}; struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ @@ -617,10 +623,17 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - block_t blk; /* start block address of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int c_len; /* physical extent length of compressed blocks */ + /* physical extent length of compressed blocks */ + unsigned int c_len; #endif + }; + }; }; struct extent_node { @@ -632,13 +645,25 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ - struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest 
extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ +}; + +struct extent_tree_info { + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ }; /* @@ -801,7 +826,8 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct task_struct *atomic_write_task; /* store atomic write task */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ pgoff_t ra_offset; /* ongoing readahead offset */ struct inode *cow_inode; /* copy-on-write inode for atomic write */ @@ -1625,14 +1651,7 @@ struct f2fs_sb_info { struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ - struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct mutex extent_tree_lock; /* locking extent radix tree */ - struct list_head extent_list; /* lru list for shrinker */ - spinlock_t extent_lock; /* locking extent lru list */ - atomic_t total_ext_tree; /* extent tree count */ - struct list_head zombie_list; /* extent zombie tree list */ - atomic_t total_zombie_tree; /* extent zombie tree count */ - atomic_t total_ext_node; /* extent info count */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1717,10 +1736,14 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic64_t total_hit_ext; /* # of lookup extent cache */ - atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic64_t read_hit_largest; /* # of hit largest extent node */ - atomic64_t read_hit_cached; /* # of hit cached extent node */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -3867,9 +3890,17 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - unsigned long long hit_largest, hit_cached, hit_rbtree; - unsigned long long hit_total, total_ext; - int ext_tree, zombie_tree, ext_node; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + unsigned long long 
ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -3928,10 +3959,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) -#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -4054,10 +4085,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sbi) do { } while (0) -#define stat_inc_rbtree_node_hit(sbi) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) -#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) @@ -4163,20 +4194,23 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); -unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei); -void f2fs_update_extent_cache(struct dnode_of_data *dn); -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); +/* read extent cache ops */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ @@ -4243,9 +4277,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type 
io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); -void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); @@ -4326,9 +4360,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) -static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len) { } +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } #endif static inline int set_compress_context(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4828e4ca0d3f..662dedd02732 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -637,7 +637,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) */ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; - f2fs_update_extent_cache_range(dn, fofs, 0, len); + f2fs_update_read_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -1473,7 +1473,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, f2fs_set_data_blkaddr(dn); } - f2fs_update_extent_cache_range(dn, start, 0, index - start); + f2fs_update_read_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -2595,7 +2595,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2627,7 +2627,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * lookup mapping info in extent cache, skip defragmenting if physical * block addresses are continuous. 
*/ - if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { if (ei.fofs + ei.len >= pg_end) goto out; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ca57bd4fb784..30673f49ea54 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1146,7 +1146,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1164,7 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 4207d4c2081a..0f0759a708f6 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -262,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (fi->extent_tree) { - struct extent_info *ei = &fi->extent_tree->largest; + if (fi->extent_tree[EX_READ]) { + struct extent_info *ei = &fi->extent_tree[EX_READ]->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -607,7 +607,7 @@ struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4f60b19e5fb2..1dcc3ef3d2e7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -86,9 +86,11 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == READ_EXTENT_CACHE) { - mem_size = (atomic_read(&sbi->total_ext_tree) * + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + + mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + - atomic_read(&sbi->total_ext_node) * + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == DISCARD_CACHE) { @@ -857,7 +859,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + 1); - f2fs_update_extent_tree_range_compressed(dn->inode, + f2fs_update_read_extent_tree_range_compressed(dn->inode, index, blkaddr, F2FS_I(dn->inode)->i_cluster_size, c_len); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1ea2c253aae7..fdba5160f8d4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -449,7 +449,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) /* try to shrink extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 
dd3c3c7a90ec..33c490e69ae3 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) return count > 0 ? count : 0; } -static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) { - return atomic_read(&sbi->total_zombie_tree) + - atomic_read(&sbi->total_ext_node); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, @@ -53,8 +56,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); - /* count extent cache entries */ - count += __count_extent_cache(sbi); + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -99,8 +102,8 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; - /* shrink extent cache entries */ - freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); /* shrink clean nat cache entries */ if (freed < nr) @@ -130,7 +133,7 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { - f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index da8147c649cd..d772cd2061cf 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -48,6 +48,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); +TRACE_DEFINE_ENUM(EX_READ); #define show_block_type(type) \ __print_symbolic(type, \ @@ -1529,28 +1530,31 @@ TRACE_EVENT(f2fs_issue_flush, TRACE_EVENT(f2fs_lookup_extent_tree_start, - TP_PROTO(struct inode *inode, unsigned int pgofs), + TP_PROTO(struct inode *inode, unsigned int pgofs, enum extent_type type), - TP_ARGS(inode, pgofs), + TP_ARGS(inode, pgofs, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; + __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u", + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", show_dev_ino(__entry), - __entry->pgofs) + __entry->pgofs, + __entry->type == EX_READ ? 
"Read" : "N/A") ); -TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, +TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, TP_PROTO(struct inode *inode, unsigned int pgofs, struct extent_info *ei), @@ -1564,8 +1568,8 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, fofs) - __field(u32, blk) __field(unsigned int, len) + __field(u32, blk) ), TP_fast_assign( @@ -1573,26 +1577,26 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->fofs = ei->fofs; - __entry->blk = ei->blk; __entry->len = ei->len; + __entry->blk = ei->blk; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "ext_info(fofs: %u, blk: %u, len: %u)", + "read_ext_info(fofs: %u, len: %u, blk: %u)", show_dev_ino(__entry), __entry->pgofs, __entry->fofs, - __entry->blk, - __entry->len) + __entry->len, + __entry->blk) ); -TRACE_EVENT(f2fs_update_extent_tree_range, +TRACE_EVENT(f2fs_update_read_extent_tree_range, - TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr, - unsigned int len, + TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, + block_t blkaddr, unsigned int c_len), - TP_ARGS(inode, pgofs, blkaddr, len, c_len), + TP_ARGS(inode, pgofs, len, blkaddr, c_len), TP_STRUCT__entry( __field(dev_t, dev) @@ -1607,67 +1611,73 @@ TRACE_EVENT(f2fs_update_extent_tree_range, __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; - __entry->blk = blkaddr; __entry->len = len; + __entry->blk = blkaddr; __entry->c_len = c_len; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "blkaddr = %u, len = %u, " - "c_len = %u", + "len = %u, blkaddr = %u, c_len = %u", show_dev_ino(__entry), __entry->pgofs, - __entry->blk, __entry->len, + __entry->blk, __entry->c_len) ); TRACE_EVENT(f2fs_shrink_extent_tree, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt, - unsigned int tree_cnt), + unsigned int tree_cnt, enum extent_type type), - TP_ARGS(sbi, node_cnt, tree_cnt), + TP_ARGS(sbi, node_cnt, tree_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, node_cnt) __field(unsigned int, tree_cnt) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->node_cnt = node_cnt; __entry->tree_cnt = tree_cnt; + __entry->type = type; ), - TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", + TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u, type = %s", show_dev(__entry->dev), __entry->node_cnt, - __entry->tree_cnt) + __entry->tree_cnt, + __entry->type == EX_READ ? "Read" : "N/A") ); TRACE_EVENT(f2fs_destroy_extent_tree, - TP_PROTO(struct inode *inode, unsigned int node_cnt), + TP_PROTO(struct inode *inode, unsigned int node_cnt, + enum extent_type type), - TP_ARGS(inode, node_cnt), + TP_ARGS(inode, node_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, node_cnt) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->node_cnt = node_cnt; + __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u", + TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), - __entry->node_cnt) + __entry->node_cnt, + __entry->type == EX_READ ? 
"Read" : "N/A") ); DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, From 9df4ebe032e15d16181c6c8ed8dc66bf27046664 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2022 13:51:09 -0800 Subject: [PATCH 553/580] f2fs: allocate the extent_cache by default Let's allocate it to remove the runtime complexity. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 38 +++++++++++++++++++------------------- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/inode.c | 6 ++++-- fs/f2fs/namei.c | 4 ++-- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 654a14ab8977..305f969e3ad1 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -47,20 +47,23 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) +{ + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; +} + static bool __may_extent_tree(struct inode *inode, enum extent_type type) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - /* * for recovered files during mount do not create extents * if shrinker is not registered. */ - if (list_empty(&sbi->s_list)) + if (list_empty(&F2FS_I_SB(inode)->s_list)) return false; - if (type == EX_READ) - return __may_read_extent_tree(inode); - return false; + return __init_may_extent_tree(inode, type); } static void __try_update_largest_extent(struct extent_tree *et, @@ -439,20 +442,18 @@ static void __drop_largest_extent(struct extent_tree *et, } } -/* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, - enum extent_type type) +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree_info *eti = &sbi->extent_tree[type]; - struct f2fs_extent *i_ext = ipage ? 
&F2FS_INODE(ipage)->i_ext : NULL; + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!__may_extent_tree(inode, type)) { + if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ - if (type == EX_READ && i_ext && i_ext->len) { + if (i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); @@ -460,13 +461,11 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, goto out; } - et = __grab_extent_tree(inode, type); + et = __grab_extent_tree(inode, EX_READ); if (!i_ext || !i_ext->len) goto out; - BUG_ON(type != EX_READ); - get_read_extent_info(&ei, i_ext); write_lock(&et->lock); @@ -486,14 +485,15 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, unlock_out: write_unlock(&et->lock); out: - if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + if (!F2FS_I(inode)->extent_tree[EX_READ]) set_inode_flag(inode, FI_NO_EXTENT); } -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ - __f2fs_init_extent_tree(inode, ipage, EX_READ); + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2ac0cc0f108e..bb9bad7b7323 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4194,7 +4194,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); +void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); @@ -4203,6 +4203,7 @@ int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); /* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); void f2fs_update_read_extent_cache(struct dnode_of_data *dn); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0f0759a708f6..0266e111d1a6 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -392,8 +392,6 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, node_page); - get_inline_info(inode, ri); fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
@@ -479,6 +477,10 @@ static int do_read_inode(struct inode *inode) } init_idisk_time(inode); + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ff4d79110fdc..555fcdc380a9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -278,8 +278,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode, } F2FS_I(inode)->i_inline_xattr_size = xattr_size; - f2fs_init_extent_tree(inode, NULL); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -305,6 +303,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode, f2fs_set_inode_flags(inode); + f2fs_init_extent_tree(inode); + trace_f2fs_new_inode(inode, 0); return inode; From dfcfcc6d16b926fdf19d95f8e861c717ad3cf62b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2022 17:37:15 -0800 Subject: [PATCH 554/580] f2fs: add block_age-based extent cache This patch introduces a runtime hot/cold data separation method for f2fs, in order to improve the accuracy for data temperature classification, reduce the garbage collection overhead after long-term data updates. Enhanced hot/cold data separation can record data block update frequency as "age" of the extent per inode, and take use of the age info to indicate better temperature type for data block allocation: - It records total data blocks allocated since mount; - When file extent has been updated, it calculate the count of data blocks allocated since last update as the age of the extent; - Before the data block allocated, it searches for the age info and chooses the suitable segment for allocation. Test and result: - Prepare: create about 30000 files * 3% for cold files (with cold file extension like .apk, from 3M to 10M) * 50% for warm files (with random file extension like .FcDxq, from 1K to 4M) * 47% for hot files (with hot file extension like .db, from 1K to 256K) - create(5%)/random update(90%)/delete(5%) the files * total write amount is about 70G * fsync will be called for .db files, and buffered write will be used for other files The storage of test device is large enough(128G) so that it will not switch to SSR mode during the test. Benefit: dirty segment count increment reduce about 14% - before: Dirty +21110 - after: Dirty +18286 Signed-off-by: qixiaoyu1 Signed-off-by: xiongping1 Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 14 ++ Documentation/filesystems/f2fs.rst | 4 + fs/f2fs/debug.c | 21 +++ fs/f2fs/extent_cache.c | 183 +++++++++++++++++++++++- fs/f2fs/f2fs.h | 38 +++++ fs/f2fs/file.c | 1 + fs/f2fs/inode.c | 1 + fs/f2fs/node.c | 10 +- fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 33 +++++ fs/f2fs/shrinker.c | 10 +- fs/f2fs/super.c | 14 ++ fs/f2fs/sysfs.c | 24 ++++ include/trace/events/f2fs.h | 86 ++++++++++- 14 files changed, 430 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index fc0704ff13bd..ae2ab41e07cb 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -630,3 +630,17 @@ Description: When space utilization exceeds this, do background DISCARD aggressi Does DISCARD forcibly in a period of given min_discard_issue_time when the number of discards is not 0 and set discard granularity to 1. 
Default: 80 + +What: /sys/fs/f2fs//hot_data_age_threshold +Date: November 2022 +Contact: "Ping Xiong" +Description: When DATA SEPARATION is on, it controls the age threshold to indicate + the data blocks as hot. By default it was initialized as 262144 blocks + (equals to 1GB). + +What: /sys/fs/f2fs//warm_data_age_threshold +Date: November 2022 +Contact: "Ping Xiong" +Description: When DATA SEPARATION is on, it controls the age threshold to indicate + the data blocks as warm. By default it was initialized as 2621440 blocks + (equals to 10GB). diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 6fd72befee71..b725d8190bef 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -348,6 +348,10 @@ memory=%s Control memory mode. This supports "normal" and "low" modes. Because of the nature of low memory devices, in this mode, f2fs will try to save memory sometimes by sacrificing performance. "normal" mode is the default mode and same as before. +age_extent_cache Enable an age extent cache based on rb-tree. It records + data block update frequency of the extent per inode, in + order to provide better temperature hints for data block + allocation. ======================== ============================================================ Debugfs Entries diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a9baa121d829..8f1ef742551f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -88,6 +88,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_largest = atomic64_read(&sbi->read_hit_largest); si->hit_total[EX_READ] += si->hit_largest; + /* block age extent_cache only */ + si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks); + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); @@ -516,6 +519,22 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree[EX_READ], si->zombie_tree[EX_READ], si->ext_node[EX_READ]); + seq_puts(s, "\nExtent Cache (Block Age):\n"); + seq_printf(s, " - Allocated Data Blocks: %llu\n", + si->allocated_data_blocks); + seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n", + si->hit_cached[EX_BLOCK_AGE], + si->hit_rbtree[EX_BLOCK_AGE]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_BLOCK_AGE] ? 0 : + div64_u64(si->hit_total[EX_BLOCK_AGE] * 100, + si->total_ext[EX_BLOCK_AGE]), + si->hit_total[EX_BLOCK_AGE], + si->total_ext[EX_BLOCK_AGE]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_BLOCK_AGE], + si->zombie_tree[EX_BLOCK_AGE], + si->ext_node[EX_BLOCK_AGE]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -586,6 +605,8 @@ static int stat_show(struct seq_file *s, void *v) si->cache_mem >> 10); seq_printf(s, " - read extent cache: %llu KB\n", si->ext_mem[EX_READ] >> 10); + seq_printf(s, " - block age extent cache: %llu KB\n", + si->ext_mem[EX_BLOCK_AGE] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 305f969e3ad1..1bd38a78ebba 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -6,6 +6,10 @@ * Copyright (c) 2015 Samsung Electronics * Authors: Jaegeuk Kim * Chao Yu + * + * block_age-based extent cache added by: + * Copyright (c) 2022 xiaomi Co., Ltd. 
+ * http://www.xiaomi.com/ */ #include @@ -18,6 +22,7 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, block_t blk, bool keep_clen, + unsigned long age, unsigned long last_blocks, enum extent_type type) { ei->fofs = fofs; @@ -30,6 +35,9 @@ static void __set_extent_info(struct extent_info *ei, #ifdef CONFIG_F2FS_FS_COMPRESSION ei->c_len = 0; #endif + } else if (type == EX_BLOCK_AGE) { + ei->age = age; + ei->last_blocks = last_blocks; } } @@ -47,10 +55,27 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } +static bool __may_age_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return false; + /* don't cache block age info for cold file */ + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { if (type == EX_READ) return __may_read_extent_tree(inode); + else if (type == EX_BLOCK_AGE) + return __may_age_extent_tree(inode); return false; } @@ -90,6 +115,11 @@ static bool __is_extent_mergeable(struct extent_info *back, #endif return (back->fofs + back->len == front->fofs && back->blk + back->len == front->blk); + } else if (type == EX_BLOCK_AGE) { + return (back->fofs + back->len == front->fofs && + abs(back->age - front->age) <= SAME_AGE_REGION && + abs(back->last_blocks - front->last_blocks) <= + SAME_AGE_REGION); } return false; } @@ -489,11 +519,22 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) set_inode_flag(inode, FI_NO_EXTENT); } +void f2fs_init_age_extent_tree(struct inode *inode) +{ + if (!__init_may_extent_tree(inode, EX_BLOCK_AGE)) + return; + __grab_extent_tree(inode, EX_BLOCK_AGE); +} + void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ if (__init_may_extent_tree(inode, EX_READ)) __grab_extent_tree(inode, EX_READ); + + /* initialize block age cache */ + if (__init_may_extent_tree(inode, EX_BLOCK_AGE)) + __grab_extent_tree(inode, EX_BLOCK_AGE); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -544,6 +585,8 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, if (type == EX_READ) trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); + else if (type == EX_BLOCK_AGE) + trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei); return ret; } @@ -642,6 +685,10 @@ static void __update_extent_tree_range(struct inode *inode, if (type == EX_READ) trace_f2fs_update_read_extent_tree_range(inode, fofs, len, tei->blk, 0); + else if (type == EX_BLOCK_AGE) + trace_f2fs_update_age_extent_tree_range(inode, fofs, len, + tei->age, tei->last_blocks); + write_lock(&et->lock); if (type == EX_READ) { @@ -694,6 +741,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&ei, end, org_end - end, end - dei.fofs + dei.blk, false, + dei.age, dei.last_blocks, type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); @@ -702,6 +750,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), en->ei.blk + (end - dei.fofs), true, + dei.age, dei.last_blocks, type); next_en = en; } @@ -732,11 +781,15 @@ static void __update_extent_tree_range(struct inode *inode, en = next_en; } + if (type == EX_BLOCK_AGE) + goto update_age_extent_cache; + /* 3. 
update extent in read extent cache */ BUG_ON(type != EX_READ); if (tei->blk) { - __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); + __set_extent_info(&ei, fofs, len, tei->blk, false, + 0, 0, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -758,7 +811,17 @@ static void __update_extent_tree_range(struct inode *inode, et->largest_updated = false; updated = true; } + goto out_read_extent_cache; +update_age_extent_cache: + if (!tei->last_blocks) + goto out_read_extent_cache; + __set_extent_info(&ei, fofs, len, 0, false, + tei->age, tei->last_blocks, EX_BLOCK_AGE); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +out_read_extent_cache: write_unlock(&et->lock); if (updated) @@ -796,7 +859,7 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); + __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -807,6 +870,72 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, } #endif +static unsigned long long __calculate_block_age(unsigned long long new, + unsigned long long old) +{ + unsigned long long diff; + + diff = (new >= old) ? new - (new - old) : new + (old - new); + + return div_u64(diff * LAST_AGE_WEIGHT, 100); +} + +/* This returns a new age and allocated blocks in ei */ +static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t f_size = i_size_read(inode); + unsigned long long cur_blocks = + atomic64_read(&sbi->allocated_data_blocks); + + /* + * When I/O is not aligned to a PAGE_SIZE, update will happen to the last + * file block even in seq write. So don't record age for newly last file + * block here. 
+ */ + if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && + ei->blk == NEW_ADDR) + return -EINVAL; + + if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { + unsigned long long cur_age; + + if (cur_blocks >= ei->last_blocks) + cur_age = cur_blocks - ei->last_blocks; + else + /* allocated_data_blocks overflow */ + cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks; + + if (ei->age) + ei->age = __calculate_block_age(cur_age, ei->age); + else + ei->age = cur_age; + ei->last_blocks = cur_blocks; + WARN_ON(ei->age > cur_blocks); + return 0; + } + + f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + + /* the data block was allocated for the first time */ + if (ei->blk == NEW_ADDR) + goto out; + + if (__is_valid_data_blkaddr(ei->blk) && + !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + f2fs_bug_on(sbi, 1); + return -EINVAL; + } +out: + /* + * init block age with zero, this can happen when the block age extent + * was reclaimed due to memory constraint or system reboot + */ + ei->age = 0; + ei->last_blocks = cur_blocks; + return 0; +} + static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { struct extent_info ei; @@ -823,6 +952,10 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ ei.blk = NULL_ADDR; else ei.blk = dn->data_blkaddr; + } else if (type == EX_BLOCK_AGE) { + ei.blk = dn->data_blkaddr; + if (__get_new_block_age(dn->inode, &ei)) + return; } __update_extent_tree_range(dn->inode, &ei, type); } @@ -940,6 +1073,43 @@ unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrin return __shrink_extent_tree(sbi, nr_shrink, EX_READ); } +/* block age extent cache operations */ +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_BLOCK_AGE)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + }; + + if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE); +} + +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); +} + static unsigned int __destroy_extent_node(struct inode *inode, enum extent_type type) { @@ -960,6 +1130,7 @@ static unsigned int __destroy_extent_node(struct inode *inode, void f2fs_destroy_extent_node(struct inode *inode) { __destroy_extent_node(inode, EX_READ); + __destroy_extent_node(inode, EX_BLOCK_AGE); } static void __drop_extent_tree(struct inode *inode, enum extent_type type) @@ -988,6 +1159,7 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) void f2fs_drop_extent_tree(struct inode *inode) { __drop_extent_tree(inode, EX_READ); + __drop_extent_tree(inode, EX_BLOCK_AGE); } static void __destroy_extent_tree(struct inode *inode, enum extent_type type) @@ -1028,6 +1200,7 @@ static void __destroy_extent_tree(struct inode *inode, enum extent_type type) void f2fs_destroy_extent_tree(struct inode *inode) { __destroy_extent_tree(inode, EX_READ); + __destroy_extent_tree(inode, EX_BLOCK_AGE); } static void __init_extent_tree_info(struct 
extent_tree_info *eti) @@ -1045,6 +1218,12 @@ static void __init_extent_tree_info(struct extent_tree_info *eti) void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { __init_extent_tree_info(&sbi->extent_tree[EX_READ]); + __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]); + + /* initialize for block age extents */ + atomic64_set(&sbi->allocated_data_blocks, 0); + sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; + sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb9bad7b7323..2e706ce88a00 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,6 +103,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 +#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -603,9 +604,22 @@ enum { /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 +/* number of age extent info in extent cache we try to shrink */ +#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 +#define LAST_AGE_WEIGHT 30 +#define SAME_AGE_REGION 1024 + +/* + * Define data block with age less than 1GB as hot data + * define data block with age less than 10GB but more than 1GB as warm data + */ +#define DEF_HOT_DATA_AGE_THRESHOLD 262144 +#define DEF_WARM_DATA_AGE_THRESHOLD 2621440 + /* extent cache type */ enum extent_type { EX_READ, + EX_BLOCK_AGE, NR_EXTENT_CACHES, }; @@ -633,6 +647,13 @@ struct extent_info { unsigned int c_len; #endif }; + /* block age extent_cache */ + struct { + /* block age of the extent */ + unsigned long long age; + /* last total blocks allocated */ + unsigned long long last_blocks; + }; }; }; @@ -1652,6 +1673,11 @@ struct f2fs_sb_info { /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; + atomic64_t allocated_data_blocks; /* for block age extent_cache */ + + /* The threshold used for hot and warm data seperation*/ + unsigned int hot_data_age_threshold; + unsigned int warm_data_age_threshold; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -3901,6 +3927,8 @@ struct f2fs_stat_info { unsigned long long ext_mem[NR_EXTENT_CACHES]; /* for read extent cache */ unsigned long long hit_largest; + /* for block age extent cache */ + unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -4212,6 +4240,16 @@ void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +/* block age extent cache ops */ +void f2fs_init_age_extent_tree(struct inode *inode); +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_age_extent_cache(struct dnode_of_data *dn); +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len); +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 662dedd02732..27e26cd700e3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -638,6 +638,7 @@ void f2fs_truncate_data_blocks_range(struct 
dnode_of_data *dn, int count) fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); + f2fs_update_age_extent_cache_range(dn, fofs, nr_free); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0266e111d1a6..8c8141343bab 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -480,6 +480,7 @@ static int do_read_inode(struct inode *inode) /* Need all the flag bits */ f2fs_init_read_extent_tree(inode, node_page); + f2fs_init_age_extent_tree(inode); f2fs_put_page(node_page, 1); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1dcc3ef3d2e7..3c813c933225 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -60,7 +60,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) avail_ram = val.totalram - val.totalhigh; /* - * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + * give 25%, 25%, 50%, 50%, 25%, 25% memory for each components respectively */ if (type == FREE_NIDS) { mem_size = (nm_i->nid_cnt[FREE_NID] * @@ -85,14 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == READ_EXTENT_CACHE) { - struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) { + enum extent_type etype = type == READ_EXTENT_CACHE ? + EX_READ : EX_BLOCK_AGE; + struct extent_tree_info *eti = &sbi->extent_tree[etype]; mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; - res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0aa48704c77a..99454d46a939 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,6 +147,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ READ_EXTENT_CACHE, /* indicates read extent cache */ + AGE_EXTENT_CACHE, /* indicates age extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fdba5160f8d4..765ec23f89e4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -452,6 +452,11 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) f2fs_shrink_read_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + /* try to shrink age extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) + f2fs_shrink_age_extent_tree(sbi, + AGE_EXTENT_CACHE_SHRINK_NUMBER); + /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); @@ -3151,10 +3156,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) } } +static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_info ei; + + if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { + if (!ei.age) + return NO_CHECK_TYPE; + if (ei.age <= 
sbi->hot_data_age_threshold) + return CURSEG_HOT_DATA; + if (ei.age <= sbi->warm_data_age_threshold) + return CURSEG_WARM_DATA; + return CURSEG_COLD_DATA; + } + return NO_CHECK_TYPE; +} + static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; + int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; @@ -3169,6 +3192,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; + + type = __get_age_segment_type(inode, fio->page->index); + if (type != NO_CHECK_TYPE) + return type; + if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) @@ -3287,6 +3315,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); + if (IS_DATASEG(type)) + atomic64_inc(&sbi->allocated_data_blocks); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { @@ -3414,6 +3445,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) + f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 33c490e69ae3..83d6fb97dcae 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -59,6 +59,9 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, /* count read extent cache entries */ count += __count_extent_cache(sbi, EX_READ); + /* count block age extent cache entries */ + count += __count_extent_cache(sbi, EX_BLOCK_AGE); + /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -102,8 +105,11 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; + /* shrink extent cache entries */ + freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2); + /* shrink read extent cache entries */ - freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2); /* shrink clean nat cache entries */ if (freed < nr) @@ -134,6 +140,8 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); + f2fs_shrink_age_extent_tree(sbi, + __count_extent_cache(sbi, EX_BLOCK_AGE)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 46a80121fe0b..c6e56b3c37fd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -160,6 +160,7 @@ enum { Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, + Opt_age_extent_cache, Opt_err, }; @@ -237,6 +238,7 @@ static match_table_t f2fs_tokens = { {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, {Opt_memory_mode, "memory=%s"}, + {Opt_age_extent_cache, "age_extent_cache"}, {Opt_err, NULL}, }; @@ -1233,6 +1235,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_age_extent_cache: + set_opt(sbi, AGE_EXTENT_CACHE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1942,6 +1947,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) 
seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, AGE_EXTENT_CACHE)) + seq_puts(seq, ",age_extent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); @@ -2198,6 +2205,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); + bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2292,6 +2300,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; } + /* disallow enable/disable age extent_cache dynamically */ + if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch age_extent_cache option is not allowed"); + goto restore_opts; + } if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { err = -EINVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8d16f649f2be..3af412cb901a 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -669,6 +669,24 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "hot_data_age_threshold")) { + if (t == 0 || t >= sbi->warm_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "warm_data_age_threshold")) { + if (t == 0 || t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -921,6 +939,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); +/* For block age extent cache */ +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -1012,6 +1034,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(peak_atomic_write), ATTR_LIST(committed_atomic_block), ATTR_LIST(revoked_atomic_block), + ATTR_LIST(hot_data_age_threshold), + ATTR_LIST(warm_data_age_threshold), NULL, }; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index d772cd2061cf..22a0fca0ec9d 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -49,6 +49,7 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); TRACE_DEFINE_ENUM(EX_READ); +TRACE_DEFINE_ENUM(EX_BLOCK_AGE); #define show_block_type(type) \ __print_symbolic(type, \ @@ -164,6 +165,11 @@ TRACE_DEFINE_ENUM(EX_READ); { COMPRESS_LZ4, "LZ4" }, \ { COMPRESS_ZSTD, "ZSTD" }) +#define show_extent_type(type) \ + __print_symbolic(type, \ + { EX_READ, "Read" }, \ + { EX_BLOCK_AGE, "Block Age" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -1551,7 +1557,7 @@ TRACE_EVENT(f2fs_lookup_extent_tree_start, TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", show_dev_ino(__entry), __entry->pgofs, - __entry->type == 
EX_READ ? "Read" : "N/A") + show_extent_type(__entry->type)) ); TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, @@ -1590,6 +1596,45 @@ TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, __entry->blk) ); +TRACE_EVENT_CONDITION(f2fs_lookup_age_extent_tree_end, + + TP_PROTO(struct inode *inode, unsigned int pgofs, + struct extent_info *ei), + + TP_ARGS(inode, pgofs, ei), + + TP_CONDITION(ei), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, pgofs) + __field(unsigned int, fofs) + __field(unsigned int, len) + __field(unsigned long long, age) + __field(unsigned long long, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgofs = pgofs; + __entry->fofs = ei->fofs; + __entry->len = ei->len; + __entry->age = ei->age; + __entry->blocks = ei->last_blocks; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + "age_ext_info(fofs: %u, len: %u, age: %llu, blocks: %llu)", + show_dev_ino(__entry), + __entry->pgofs, + __entry->fofs, + __entry->len, + __entry->age, + __entry->blocks) +); + TRACE_EVENT(f2fs_update_read_extent_tree_range, TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, @@ -1625,6 +1670,41 @@ TRACE_EVENT(f2fs_update_read_extent_tree_range, __entry->c_len) ); +TRACE_EVENT(f2fs_update_age_extent_tree_range, + + TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, + unsigned long long age, + unsigned long long last_blks), + + TP_ARGS(inode, pgofs, len, age, last_blks), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, pgofs) + __field(unsigned int, len) + __field(unsigned long long, age) + __field(unsigned long long, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgofs = pgofs; + __entry->len = len; + __entry->age = age; + __entry->blocks = last_blks; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + "len = %u, age = %llu, blocks = %llu", + show_dev_ino(__entry), + __entry->pgofs, + __entry->len, + __entry->age, + __entry->blocks) +); + TRACE_EVENT(f2fs_shrink_extent_tree, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt, @@ -1650,7 +1730,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree, show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt, - __entry->type == EX_READ ? "Read" : "N/A") + show_extent_type(__entry->type)) ); TRACE_EVENT(f2fs_destroy_extent_tree, @@ -1677,7 +1757,7 @@ TRACE_EVENT(f2fs_destroy_extent_tree, TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), __entry->node_cnt, - __entry->type == EX_READ ? 
"Read" : "N/A") + show_extent_type(__entry->type)) ); DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, From 908a46704da7392e0f2d93dfd47e52c967a6672d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Dec 2022 15:44:57 -0800 Subject: [PATCH 555/580] f2fs: avoid build warnining in extent_cache Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1bd38a78ebba..fd3d804bdd83 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -672,7 +672,7 @@ static void __update_extent_tree_range(struct inode *inode, struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; - struct extent_info ei, dei, prev; + struct extent_info ei, dei, prev = {}; struct rb_node **insert_p = NULL, *insert_parent = NULL; unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; From 86d6a67f9ec717cc0c6bfd178aa3fb8a2d3ab3e8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 7 Dec 2022 13:42:17 +0000 Subject: [PATCH 556/580] f2fs: Fix spelling mistake in label: free_bio_enrty_cache -> free_bio_entry_cache There is a spelling mistake in a label name. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c6e56b3c37fd..259118e1733e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4666,7 +4666,7 @@ static int __init init_f2fs_fs(void) goto free_iostat; err = f2fs_init_bioset(); if (err) - goto free_bio_enrty_cache; + goto free_bio_entry_cache; err = f2fs_init_compress_mempool(); if (err) goto free_bioset; @@ -4683,7 +4683,7 @@ static int __init init_f2fs_fs(void) f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); -free_bio_enrty_cache: +free_bio_entry_cache: f2fs_destroy_bio_entry_cache(); free_iostat: f2fs_destroy_iostat_processing(); From bdddff7fb680a0b37535322a4ac7417438b707c0 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 5 Dec 2022 22:56:03 +0800 Subject: [PATCH 557/580] f2fs: fix iostat parameter for discard Just like other data we count uses the number of bytes as the basic unit, but discard uses the number of cmds as the statistical unit. In fact the discard command contains the number of blocks, so let's change to the number of bytes as the base unit. Fixes: b0af6d491a6b ("f2fs: add app/fs io stat") Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 765ec23f89e4..6f82a23d5987 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1187,7 +1187,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, FS_DISCARD, 1); + f2fs_update_iostat(sbi, FS_DISCARD, len * F2FS_BLKSIZE); lstart += len; start += len; From cad5c072d0535622debc701bb35c710fd96f4a1e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 2 Dec 2022 12:58:41 +0800 Subject: [PATCH 558/580] f2fs: don't call f2fs_issue_discard_timeout() when discard_cmd_cnt is 0 in f2fs_put_super() No need to call f2fs_issue_discard_timeout() in f2fs_put_super, when no discard command requires issue. Since the caller of f2fs_issue_discard_timeout() usually judges the number of discard commands before using it. 
Let's move this logic to f2fs_issue_discard_timeout(). By the way, use f2fs_realtime_discard_enable to simplify the code. Reported-by: kernel test robot Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- fs/f2fs/super.c | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6f82a23d5987..ef77a89d0fe4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1662,6 +1662,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; + if (!atomic_read(&dcc->discard_cmd_cnt)) + return false; + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); @@ -2116,8 +2119,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. */ - if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 259118e1733e..14677c6f3691 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1567,8 +1567,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_issue_discard_timeout(sbi); - if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && - !sbi->discard_blks && !dropped) { + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -2212,7 +2211,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); - struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2399,10 +2397,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_flush; need_stop_discard = true; } else { - dcc = SM_I(sbi)->dcc_info; f2fs_stop_discard_thread(sbi); - if (atomic_read(&dcc->discard_cmd_cnt)) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); need_restart_discard = true; } } From dae7b24407e3c460b61a81dbc89656fc215d7bf1 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 29 Nov 2022 12:15:23 +0800 Subject: [PATCH 559/580] f2fs: fix some format WARNING in debug.c and sysfs.c To fix: WARNING: function definition argument 'struct f2fs_attr *' should also have an identifier name + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); WARNING: return sysfs_emit(...) 
formats should include a terminating newline + return sysfs_emit(buf, "(none)"); WARNING: Prefer 'unsigned int' to bare use of 'unsigned' + unsigned npages = NODE_MAPPING(sbi)->nrpages; WARNING: Missing a blank line after declarations + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; WARNING: quoted string split across lines + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 45 +++++++++++++++++++++++---------------------- fs/f2fs/sysfs.c | 10 +++++----- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8f1ef742551f..32af4f0c5735 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -318,18 +318,19 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->page_mem = 0; if (sbi->node_inode) { - unsigned npages = NODE_MAPPING(sbi)->nrpages; + unsigned long npages = NODE_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { - unsigned npages = META_MAPPING(sbi)->nrpages; + unsigned long npages = META_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (sbi->compress_inode) { - unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #endif @@ -477,28 +478,28 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); - seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " - "Cur time: %4d(ms), Peak time: %4d(ms))\n", - si->nr_queued_ckpt, si->nr_issued_ckpt, - si->nr_total_ckpt, si->cur_ckpt_time, - si->peak_ckpt_time); + seq_puts(s, "CP merge:\n"); + seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt); + seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt); + seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); + seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); + seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); - seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " - "Idle Greedy (%d), Idle AT (%d), " - "Urgent High (%d), Urgent Mid (%d), " - "Urgent Low (%d)\n", - si->sbi->gc_reclaimed_segs[GC_NORMAL], - si->sbi->gc_reclaimed_segs[GC_IDLE_CB], - si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], - si->sbi->gc_reclaimed_segs[GC_IDLE_AT], - si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], - si->sbi->gc_reclaimed_segs[GC_URGENT_MID], - si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + seq_puts(s, " - Reclaimed segs :\n"); + seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Idle Greedy : %d\n", + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]); + seq_printf(s, " - Urgent High : %d\n", + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " - Urgent Low : %d\n", 
si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, @@ -540,11 +541,11 @@ static int stat_show(struct seq_file *s, void *v) si->nr_dio_read, si->nr_dio_write); seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); - seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " - "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, - si->flush_list_empty, + si->flush_list_empty); + seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3af412cb901a..2a1566ce4a0d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -53,9 +53,9 @@ static const char *gc_mode_names[MAX_GC_MODE] = { struct f2fs_attr { struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); + ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); + ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, + const char *buf, size_t len); int struct_type; int offset; int id; @@ -233,13 +233,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sprintf(buf, "(none)"); + return sprintf(buf, "(none)\n"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); + return sprintf(buf, "%llu\n", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS From b9aeb147225616494256fcf913c559afd4088a05 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Sun, 11 Dec 2022 21:08:41 +0800 Subject: [PATCH 560/580] f2fs: reset wait_ms to default if any of the victims have been selected In non-foreground gc mode, if no victim is selected, the gc process will wait for no_gc_sleep_time before waking up again. In this subsequent time, even though a victim will be selected, the gc process still waits for no_gc_sleep_time before waking up. The configuration of wait_ms is not reasonable. After any of the victims have been selected, we need to reset wait_ms to default sleep time from no_gc_sleep_time. 
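For illustration, a minimal standalone sketch of the intended sleep-time handling (the helper next_wait() and the local struct are made up for this example; only wait_ms, min_sleep_time and no_gc_sleep_time correspond to the fields used in the hunk below):

  #include <stdbool.h>

  /* stand-in for the relevant f2fs_gc_kthread sleep-time fields */
  struct gc_sleep_times {
          unsigned int min_sleep_time;    /* default sleep between GC rounds */
          unsigned int no_gc_sleep_time;  /* long back-off when no victim exists */
  };

  static unsigned int next_wait(const struct gc_sleep_times *gc_th,
                                unsigned int wait_ms, bool has_victim)
  {
          if (!has_victim)
                  return gc_th->no_gc_sleep_time;

          /* a victim was selected: drop the long back-off carried over from
           * the previous idle round and return to the default cadence */
          if (wait_ms == gc_th->no_gc_sleep_time)
                  return gc_th->min_sleep_time;

          return wait_ms;
  }

With this shape the long no_gc_sleep_time only persists across consecutive rounds in which no victim is found.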
Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 30673f49ea54..7491f912abdf 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -141,6 +141,10 @@ static int gc_thread_func(void *data) /* don't bother wait_ms by foreground gc */ if (!foreground) wait_ms = gc_th->no_gc_sleep_time; + } else { + /* reset wait_ms to default sleep time */ + if (wait_ms == gc_th->no_gc_sleep_time) + wait_ms = gc_th->min_sleep_time; } if (foreground) From 8736793a178ba367c69f822df6a69297cdec3594 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 25 Nov 2024 15:46:16 +0100 Subject: [PATCH 561/580] UPSTREAM: ALSA: usb-audio: Fix out of bounds reads when finding clock sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a3dd4d63eeb452cfb064a13862fb376ab108f6a6 upstream. The current USB-audio driver code doesn't check bLength of each descriptor at traversing for clock descriptors. That is, when a device provides a bogus descriptor with a shorter bLength, the driver might hit out-of-bounds reads. For addressing it, this patch adds sanity checks to the validator functions for the clock descriptor traversal. When the descriptor length is shorter than expected, it's skipped in the loop. For the clock source and clock multiplier descriptors, we can just check bLength against the sizeof() of each descriptor type. OTOH, the clock selector descriptor of UAC2 and UAC3 has an array of bNrInPins elements and two more fields at its tail, hence those have to be checked in addition to the sizeof() check. Bug: 382239029 Reported-by: Benoît Sevens Cc: Link: https://lore.kernel.org/20241121140613.3651-1-bsevens@google.com Link: https://patch.msgid.link/20241125144629.20757-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Benoît Sevens Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 45a92cbc88e4013bfed7fd2ccab3ade45f8e896b) Signed-off-by: Lee Jones Change-Id: I13e916ffd46fce6fd08f7b9f96cea82bb4bc475d --- sound/usb/clock.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/sound/usb/clock.c b/sound/usb/clock.c index d1455fb2c6fc..7411e9c5ba8c 100644 --- a/sound/usb/clock.c +++ b/sound/usb/clock.c @@ -35,6 +35,10 @@ #include "clock.h" #include "quirks.h" +/* check whether the descriptor bLength has the minimal length */ +#define DESC_LENGTH_CHECK(p) \ + (p->bLength >= sizeof(*p)) + static void *find_uac_clock_desc(struct usb_host_interface *iface, int id, bool (*validator)(void *, int), u8 type) { @@ -52,36 +56,60 @@ static void *find_uac_clock_desc(struct usb_host_interface *iface, int id, static bool validate_clock_source_v2(void *p, int id) { struct uac_clock_source_descriptor *cs = p; + if (!DESC_LENGTH_CHECK(cs)) + return false; return cs->bClockID == id; } static bool validate_clock_source_v3(void *p, int id) { struct uac3_clock_source_descriptor *cs = p; + if (!DESC_LENGTH_CHECK(cs)) + return false; return cs->bClockID == id; } static bool validate_clock_selector_v2(void *p, int id) { struct uac_clock_selector_descriptor *cs = p; - return cs->bClockID == id; + if (!DESC_LENGTH_CHECK(cs)) + return false; + if (cs->bClockID != id) + return false; + /* additional length check for baCSourceID array (in bNrInPins size) + * and two more fields (which sizes depend on the protocol) + */ + return cs->bLength >= sizeof(*cs) + cs->bNrInPins + + 1 /* bmControls */ + 1 /* iClockSelector */; } 
static bool validate_clock_selector_v3(void *p, int id) { struct uac3_clock_selector_descriptor *cs = p; - return cs->bClockID == id; + if (!DESC_LENGTH_CHECK(cs)) + return false; + if (cs->bClockID != id) + return false; + /* additional length check for baCSourceID array (in bNrInPins size) + * and two more fields (which sizes depend on the protocol) + */ + return cs->bLength >= sizeof(*cs) + cs->bNrInPins + + 4 /* bmControls */ + 2 /* wCSelectorDescrStr */; } static bool validate_clock_multiplier_v2(void *p, int id) { struct uac_clock_multiplier_descriptor *cs = p; + if (!DESC_LENGTH_CHECK(cs)) + return false; return cs->bClockID == id; } static bool validate_clock_multiplier_v3(void *p, int id) { struct uac3_clock_multiplier_descriptor *cs = p; + if (!DESC_LENGTH_CHECK(cs)) + return false; return cs->bClockID == id; } From 5d4b707b4554f5763fab52448286f91c42417fc3 Mon Sep 17 00:00:00 2001 From: Abhinav Parihar Date: Tue, 17 Dec 2024 18:34:34 +0530 Subject: [PATCH 562/580] BACKPORT: dsp-kernel: Add attribute and flag checks during map creation A persistence map is expected to hold refs=2 during its creation. However, a fuzz test can create a persistence map with a mismatch between attributes and flags, using the KEEP MAP attribute together with the FD NOMAP flag. This sets the map reference count to 1. The user can then call fastrpc_internal_munmap_fd, which does not check the flags, to free the map, causing a use-after-free (UAF) of the file map and shared buffer. Add a check to restrict DMA handle maps with invalid attributes. Change-Id: I2f024ef99cc2a0487010504166e3af3433d5302d Acked-by: Santosh Signed-off-by: Abhinav Parihar --- drivers/char/adsprpc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/char/adsprpc.c b/drivers/char/adsprpc.c index 8bc094fd2d80..dcffd78a1b64 100644 --- a/drivers/char/adsprpc.c +++ b/drivers/char/adsprpc.c @@ -1026,6 +1026,12 @@ static int fastrpc_mmap_create(struct fastrpc_file *fl, int fd, map->size = len; map->va = (uintptr_t)region_vaddr; } else if (mflags == FASTRPC_DMAHANDLE_NOMAP) { + if (map->attr & FASTRPC_ATTR_KEEP_MAP) { + pr_err("adsprpc: %s: Invalid attribute 0x%x for fd %d\n", + __func__, map->attr, fd); + err = -EINVAL; + goto bail; + } VERIFY(err, !IS_ERR_OR_NULL(map->buf = dma_buf_get(fd))); if (err) goto bail; From 65e66f3b1154f65c9e670a8409a2d1b166af48b5 Mon Sep 17 00:00:00 2001 From: Dharmendra Tiwari Date: Tue, 3 Sep 2024 23:06:17 -0700 Subject: [PATCH 563/580] qcacld-3.0: Correcting the TSInfo structure size according to the Spec According to the spec, the TSInfo size should be 4 bytes. To fix this issue, the TSInfo size is increased to 4 bytes, aligning with the current standard.
CRs-Fixed: 3910625 Change-Id: I7979fa84af0295d21d4afe1b876af494a5b8fed8 --- .../staging/qcacld-3.0/core/mac/src/cfg/cfgUtil/dot11f.frms | 2 +- drivers/staging/qcacld-3.0/core/mac/src/include/dot11f.h | 4 ++-- .../qcacld-3.0/core/mac/src/sys/legacy/src/utils/src/dot11f.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/staging/qcacld-3.0/core/mac/src/cfg/cfgUtil/dot11f.frms b/drivers/staging/qcacld-3.0/core/mac/src/cfg/cfgUtil/dot11f.frms index ed3381df616f..cb1faf5623fb 100644 --- a/drivers/staging/qcacld-3.0/core/mac/src/cfg/cfgUtil/dot11f.frms +++ b/drivers/staging/qcacld-3.0/core/mac/src/cfg/cfgUtil/dot11f.frms @@ -367,7 +367,7 @@ FF SMPowerModeSet (1) //7.3.1.25 } } -FF TSInfo (3) // 7.3.2.30 +FF TSInfo (4) // 7.3.2.30 { { traffic_type: 1; diff --git a/drivers/staging/qcacld-3.0/core/mac/src/include/dot11f.h b/drivers/staging/qcacld-3.0/core/mac/src/include/dot11f.h index 507f905e17b7..247f261d8cc3 100644 --- a/drivers/staging/qcacld-3.0/core/mac/src/include/dot11f.h +++ b/drivers/staging/qcacld-3.0/core/mac/src/include/dot11f.h @@ -26,7 +26,7 @@ * * * This file was automatically generated by 'framesc' - * Wed Sep 29 13:23:21 2021 from the following file(s): + * Tue Sep 3 23:04:38 2024 from the following file(s): * * dot11f.frms * @@ -441,7 +441,7 @@ typedef struct sDot11fFfTSInfo { uint32_t unused:15; } tDot11fFfTSInfo; -#define DOT11F_FF_TSINFO_LEN (3) +#define DOT11F_FF_TSINFO_LEN (4) void dot11f_unpack_ff_ts_info(tpAniSirGlobal, uint8_t *, tDot11fFfTSInfo *); diff --git a/drivers/staging/qcacld-3.0/core/mac/src/sys/legacy/src/utils/src/dot11f.c b/drivers/staging/qcacld-3.0/core/mac/src/sys/legacy/src/utils/src/dot11f.c index 503fc06344ba..04141282c10b 100644 --- a/drivers/staging/qcacld-3.0/core/mac/src/sys/legacy/src/utils/src/dot11f.c +++ b/drivers/staging/qcacld-3.0/core/mac/src/sys/legacy/src/utils/src/dot11f.c @@ -24,7 +24,7 @@ * * * This file was automatically generated by 'framesc' - * Wed Sep 29 13:23:21 2021 from the following file(s): + * Tue Sep 3 23:04:38 2024 from the following file(s): * * dot11f.frms * @@ -16338,7 +16338,7 @@ uint32_t dot11f_get_packed_del_ts_size(tpAniSirGlobal pCtx, tDot11fDelTS *pFrm, uint32_t *pnNeeded) { uint32_t status = 0; - *pnNeeded = 7; + *pnNeeded = 8; status = get_packed_size_core(pCtx, (uint8_t *)pFrm, pnNeeded, IES_DelTS); return status; From 977fbf262198e1ca0c2f8b6f2c3079a2b1e2e9ce Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 25 Mar 2022 17:31:05 +0100 Subject: [PATCH 564/580] Revert "ANDROID: fs: FS tracepoints to track IO." 5.18-rc1 has many merge issues and the block io path has been rewritten, so the tracepoints added here do not work properly anymore (and break the build.) If this is really still needed (hint, I strongly doubt it), it can be redesigned and added back after 5.18-rc1 is released. 
Cc: Mohan Srinivasan Cc: Amit Pundir Cc: Alistair Strachan Fixes: f2fe7bac26dc ("ANDROID: fs: FS tracepoints to track IO.") Signed-off-by: Greg Kroah-Hartman [Jaegeuk Kim: remove fsync tracepoints as well] Change-Id: I64981f2f692a434b976e50677d3414037d5ee409 --- fs/ext4/inline.c | 14 ---- fs/ext4/inode.c | 54 ------------ fs/ext4/readpage.c | 47 +---------- fs/f2fs/data.c | 89 ++------------------ fs/f2fs/file.c | 11 --- fs/f2fs/inline.c | 18 ---- fs/mpage.c | 38 --------- include/trace/events/android_fs.h | 74 ---------------- include/trace/events/android_fs_template.h | 98 ---------------------- 9 files changed, 9 insertions(+), 434 deletions(-) delete mode 100644 include/trace/events/android_fs.h delete mode 100644 include/trace/events/android_fs_template.h diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 78deaeef83fa..6eb40f6850a1 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -12,7 +12,6 @@ #include "ext4.h" #include "xattr.h" #include "truncate.h" -#include #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -518,17 +517,6 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) return -EAGAIN; } - if (trace_android_fs_dataread_start_enabled()) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - inode); - trace_android_fs_dataread_start(inode, page_offset(page), - PAGE_SIZE, current->pid, - path, current->comm); - } - /* * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. @@ -540,8 +528,6 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) SetPageUptodate(page); } - trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE); - up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 87629b61dbe1..e187f540c088 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,7 +47,6 @@ #include "truncate.h" #include -#include #define MPAGE_DA_EXTENT_TAIL 0x01 @@ -1271,16 +1270,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - if (trace_android_fs_datawrite_start_enabled()) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - inode); - trace_android_fs_datawrite_start(inode, pos, len, - current->pid, path, - current->comm); - } trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case @@ -1430,7 +1419,6 @@ static int ext4_write_end(struct file *file, int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); - trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_write_end(inode, pos, len, copied); if (inline_data && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -1542,7 +1530,6 @@ static int ext4_journalled_write_end(struct file *file, int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); - trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; @@ -3110,16 +3097,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; - if (trace_android_fs_datawrite_start_enabled()) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - 
- path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - inode); - trace_android_fs_datawrite_start(inode, pos, len, - current->pid, - path, current->comm); - } trace_ext4_da_write_begin(inode, pos, len, flags); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -3238,7 +3215,6 @@ static int ext4_da_write_end(struct file *file, return ext4_write_end(file, mapping, pos, len, copied, page, fsdata); - trace_android_fs_datawrite_end(inode, pos, len); trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_SIZE - 1); end = start + copied - 1; @@ -3962,7 +3938,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; ssize_t ret; - int rw = iov_iter_rw(iter); if (!fscrypt_dio_supported(iocb, iter)) return 0; @@ -3980,28 +3955,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ext4_has_inline_data(inode)) return 0; - if (trace_android_fs_dataread_start_enabled() && - (rw == READ)) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - inode); - trace_android_fs_dataread_start(inode, offset, count, - current->pid, path, - current->comm); - } - if (trace_android_fs_datawrite_start_enabled() && - (rw == WRITE)) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - inode); - trace_android_fs_datawrite_start(inode, offset, count, - current->pid, path, - current->comm); - } trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == READ) ret = ext4_direct_IO_read(iocb, iter); @@ -4009,13 +3962,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ret = ext4_direct_IO_write(iocb, iter); trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); - if (trace_android_fs_dataread_start_enabled() && - (rw == READ)) - trace_android_fs_dataread_end(inode, offset, count); - if (trace_android_fs_datawrite_start_enabled() && - (rw == WRITE)) - trace_android_fs_datawrite_end(inode, offset, count); - return ret; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index eb9c630cdbb7..681542a53762 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -46,7 +46,6 @@ #include #include "ext4.h" -#include #define NUM_PREALLOC_POST_READ_CTXS 128 @@ -147,17 +146,6 @@ static bool bio_post_read_required(struct bio *bio) return bio->bi_private && !bio->bi_status; } -static void -ext4_trace_read_completion(struct bio *bio) -{ - struct page *first_page = bio->bi_io_vec[0].bv_page; - - if (first_page != NULL) - trace_android_fs_dataread_end(first_page->mapping->host, - page_offset(first_page), - bio->bi_iter.bi_size); -} - /* * I/O completion handler for multipage BIOs. 
* @@ -172,9 +160,6 @@ ext4_trace_read_completion(struct bio *bio) */ static void mpage_end_io(struct bio *bio) { - if (trace_android_fs_dataread_start_enabled()) - ext4_trace_read_completion(bio); - if (bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; @@ -224,30 +209,6 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) return i_size_read(inode); } -static void -ext4_submit_bio_read(struct bio *bio) -{ - if (trace_android_fs_dataread_start_enabled()) { - struct page *first_page = bio->bi_io_vec[0].bv_page; - - if (first_page != NULL) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - first_page->mapping->host); - trace_android_fs_dataread_start( - first_page->mapping->host, - page_offset(first_page), - bio->bi_iter.bi_size, - current->pid, - path, - current->comm); - } - } - submit_bio(bio); -} - int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages, bool is_readahead) @@ -395,7 +356,7 @@ int ext4_mpage_readpages(struct address_space *mapping, if (bio && (last_block_in_bio != blocks[0] - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: - ext4_submit_bio_read(bio); + submit_bio(bio); bio = NULL; } if (bio == NULL) { @@ -428,14 +389,14 @@ int ext4_mpage_readpages(struct address_space *mapping, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { - ext4_submit_bio_read(bio); + submit_bio(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; goto next_page; confused: if (bio) { - ext4_submit_bio_read(bio); + submit_bio(bio); bio = NULL; } if (!PageUptodate(page)) @@ -448,7 +409,7 @@ int ext4_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - ext4_submit_bio_read(bio); + submit_bio(bio); return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7a2bc98af235..2f12ca2d43ed 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -25,7 +25,6 @@ #include "segment.h" #include "iostat.h" #include -#include #define NUM_PREALLOC_POST_READ_CTXS 128 @@ -268,7 +267,6 @@ static void f2fs_post_read_work(struct work_struct *work) static void f2fs_read_end_io(struct bio *bio) { - struct page *first_page = bio->bi_io_vec[0].bv_page; struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); struct bio_post_read_ctx *ctx; bool intask = in_task(); @@ -286,13 +284,6 @@ static void f2fs_read_end_io(struct bio *bio) return; } - if (first_page != NULL && - __read_io_type(first_page) == F2FS_RD_DATA) { - trace_android_fs_dataread_end(first_page->mapping->host, - page_offset(first_page), - bio->bi_iter.bi_size); - } - if (ctx) { unsigned int enabled_steps = ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS); @@ -562,32 +553,6 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, submit_bio(bio); } -static void __f2fs_submit_read_bio(struct f2fs_sb_info *sbi, - struct bio *bio, enum page_type type) -{ - if (trace_android_fs_dataread_start_enabled() && (type == DATA)) { - struct page *first_page = bio->bi_io_vec[0].bv_page; - - if (first_page != NULL && - __read_io_type(first_page) == F2FS_RD_DATA) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - first_page->mapping->host); - - trace_android_fs_dataread_start( - first_page->mapping->host, - page_offset(first_page), - bio->bi_iter.bi_size, - current->pid, - 
path, - current->comm); - } - } - __submit_bio(sbi, bio, type); -} - void f2fs_submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { @@ -774,10 +739,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page) : WB_DATA_TYPE(fio->page)); - if (is_read_io(fio->op)) - __f2fs_submit_read_bio(fio->sbi, bio, fio->type); - else - __submit_bio(fio->sbi, bio, fio->type); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@ -1140,7 +1102,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - __f2fs_submit_read_bio(sbi, bio, DATA); + __submit_bio(sbi, bio, DATA); return 0; } @@ -2228,7 +2190,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, *last_block_in_bio, block_nr) || !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { submit_and_realloc: - __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } @@ -2259,7 +2221,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, goto out; confused: if (bio) { - __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } unlock_page(page); @@ -2576,7 +2538,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - __f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); if (pages && is_readahead && !drop_ra) WRITE_ONCE(F2FS_I(inode)->ra_offset, -1); @@ -3649,16 +3611,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, block_t blkaddr = NULL_ADDR; int err = 0; - if (trace_android_fs_datawrite_start_enabled()) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - inode); - trace_android_fs_datawrite_start(inode, pos, len, - current->pid, path, - current->comm); - } trace_f2fs_write_begin(inode, pos, len, flags); if (!f2fs_is_checkpoint_ready(sbi)) { @@ -3784,7 +3736,6 @@ static int f2fs_write_end(struct file *file, { struct inode *inode = page->mapping->host; - trace_android_fs_datawrite_end(inode, pos, len); trace_f2fs_write_end(inode, pos, len, copied); /* @@ -3920,29 +3871,6 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) trace_f2fs_direct_IO_enter(inode, offset, count, rw); - if (trace_android_fs_dataread_start_enabled() && - (rw == READ)) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - inode); - trace_android_fs_dataread_start(inode, offset, - count, current->pid, path, - current->comm); - } - if (trace_android_fs_datawrite_start_enabled() && - (rw == WRITE)) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - inode); - trace_android_fs_datawrite_start(inode, offset, count, - current->pid, path, - current->comm); - } - if (iocb->ki_flags & IOCB_NOWAIT) { if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[rw])) { err = -EAGAIN; @@ -3991,13 +3919,6 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } out: - if (trace_android_fs_dataread_start_enabled() && - (rw == READ)) - trace_android_fs_dataread_end(inode, offset, count); - if (trace_android_fs_datawrite_start_enabled() && - (rw == 
WRITE)) - trace_android_fs_datawrite_end(inode, offset, count); - trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); return err; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1e9aebea235d..eda802839704 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -31,7 +31,6 @@ #include "gc.h" #include "iostat.h" #include -#include #include static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) @@ -273,15 +272,6 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, trace_f2fs_sync_file_enter(inode); - if (trace_android_fs_fsync_start_enabled()) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, inode); - trace_android_fs_fsync_start(inode, - current->pid, path, current->comm); - } - if (S_ISDIR(inode->i_mode)) goto go_write; @@ -399,7 +389,6 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); - trace_android_fs_fsync_end(inode, start, end - start); return ret; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b971abfe97ac..758a8730541e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -12,7 +12,6 @@ #include "f2fs.h" #include "node.h" #include -#include static bool support_inline_data(struct inode *inode) { @@ -105,29 +104,14 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) { struct page *ipage; - if (trace_android_fs_dataread_start_enabled()) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - inode); - trace_android_fs_dataread_start(inode, page_offset(page), - PAGE_SIZE, current->pid, - path, current->comm); - } - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { - trace_android_fs_dataread_end(inode, page_offset(page), - PAGE_SIZE); unlock_page(page); return PTR_ERR(ipage); } if (!f2fs_has_inline_data(inode)) { f2fs_put_page(ipage, 1); - trace_android_fs_dataread_end(inode, page_offset(page), - PAGE_SIZE); return -EAGAIN; } @@ -139,8 +123,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); f2fs_put_page(ipage, 1); - trace_android_fs_dataread_end(inode, page_offset(page), - PAGE_SIZE); unlock_page(page); return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index ddf7bb80c68f..c820dc9bebab 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -32,16 +32,6 @@ #include #include "internal.h" -#define CREATE_TRACE_POINTS -#include - -EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_start); -EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_end); -EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_start); -EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_end); -EXPORT_TRACEPOINT_SYMBOL(android_fs_fsync_start); -EXPORT_TRACEPOINT_SYMBOL(android_fs_fsync_end); - /* * I/O completion handler for multipage BIOs. 
* @@ -59,16 +49,6 @@ static void mpage_end_io(struct bio *bio) struct bio_vec *bv; int i; - if (trace_android_fs_dataread_end_enabled() && - (bio_data_dir(bio) == READ)) { - struct page *first_page = bio->bi_io_vec[0].bv_page; - - if (first_page != NULL) - trace_android_fs_dataread_end(first_page->mapping->host, - page_offset(first_page), - bio->bi_iter.bi_size); - } - bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; page_endio(page, bio_op(bio), @@ -80,24 +60,6 @@ static void mpage_end_io(struct bio *bio) static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio) { - if (trace_android_fs_dataread_start_enabled() && (op == REQ_OP_READ)) { - struct page *first_page = bio->bi_io_vec[0].bv_page; - - if (first_page != NULL) { - char *path, pathbuf[MAX_TRACE_PATHBUF_LEN]; - - path = android_fstrace_get_pathname(pathbuf, - MAX_TRACE_PATHBUF_LEN, - first_page->mapping->host); - trace_android_fs_dataread_start( - first_page->mapping->host, - page_offset(first_page), - bio->bi_iter.bi_size, - current->pid, - path, - current->comm); - } - } bio->bi_end_io = mpage_end_io; bio_set_op_attrs(bio, op, op_flags); guard_bio_eod(op, bio); diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h deleted file mode 100644 index 0ee4a07f0240..000000000000 --- a/include/trace/events/android_fs.h +++ /dev/null @@ -1,74 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM android_fs - -#if !defined(_TRACE_ANDROID_FS_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_ANDROID_FS_H - -#include -#include - -DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start, - TP_PROTO(struct inode *inode, loff_t offset, int bytes, - pid_t pid, char *pathname, char *command), - TP_ARGS(inode, offset, bytes, pid, pathname, command)); - -DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end, - TP_PROTO(struct inode *inode, loff_t offset, int bytes), - TP_ARGS(inode, offset, bytes)); - -DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start, - TP_PROTO(struct inode *inode, loff_t offset, int bytes, - pid_t pid, char *pathname, char *command), - TP_ARGS(inode, offset, bytes, pid, pathname, command)); - -DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end, - TP_PROTO(struct inode *inode, loff_t offset, int bytes), - TP_ARGS(inode, offset, bytes)); - -DEFINE_EVENT(android_fs_fsync_start_template, android_fs_fsync_start, - TP_PROTO(struct inode *inode, - pid_t pid, char *pathname, char *command), - TP_ARGS(inode, pid, pathname, command)); - -DEFINE_EVENT(android_fs_data_end_template, android_fs_fsync_end, - TP_PROTO(struct inode *inode, loff_t offset, int bytes), - TP_ARGS(inode, offset, bytes)); - -#endif /* _TRACE_ANDROID_FS_H */ - -/* This part must be outside protection */ -#include - -#ifndef ANDROID_FSTRACE_GET_PATHNAME -#define ANDROID_FSTRACE_GET_PATHNAME - -/* Sizes an on-stack array, so careful if sizing this up ! */ -#define MAX_TRACE_PATHBUF_LEN 256 - -static inline char * -android_fstrace_get_pathname(char *buf, int buflen, struct inode *inode) -{ - char *path; - struct dentry *d; - - /* - * d_obtain_alias() will either iput() if it locates an existing - * dentry or transfer the reference to the new dentry created. - * So get an extra reference here. 
- */ - ihold(inode); - d = d_obtain_alias(inode); - if (likely(!IS_ERR(d))) { - path = dentry_path_raw(d, buf, buflen); - if (unlikely(IS_ERR(path))) { - strcpy(buf, "ERROR"); - path = buf; - } - dput(d); - } else { - strcpy(buf, "ERROR"); - path = buf; - } - return path; -} -#endif diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h deleted file mode 100644 index 0832c26acaed..000000000000 --- a/include/trace/events/android_fs_template.h +++ /dev/null @@ -1,98 +0,0 @@ -#if !defined(_TRACE_ANDROID_FS_TEMPLATE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_ANDROID_FS_TEMPLATE_H - -#include - -DECLARE_EVENT_CLASS(android_fs_data_start_template, - TP_PROTO(struct inode *inode, loff_t offset, int bytes, - pid_t pid, char *pathname, char *command), - TP_ARGS(inode, offset, bytes, pid, pathname, command), - TP_STRUCT__entry( - __string(pathbuf, pathname); - __field(loff_t, offset); - __field(int, bytes); - __field(loff_t, i_size); - __string(cmdline, command); - __field(pid_t, pid); - __field(ino_t, ino); - ), - TP_fast_assign( - { - /* - * Replace the spaces in filenames and cmdlines - * because this screws up the tooling that parses - * the traces. - */ - __assign_str(pathbuf, pathname); - (void)strreplace(__get_str(pathbuf), ' ', '_'); - __entry->offset = offset; - __entry->bytes = bytes; - __entry->i_size = i_size_read(inode); - __assign_str(cmdline, command); - (void)strreplace(__get_str(cmdline), ' ', '_'); - __entry->pid = pid; - __entry->ino = inode->i_ino; - } - ), - TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," - " pid %d, i_size %llu, ino %lu", - __get_str(pathbuf), __entry->offset, __entry->bytes, - __get_str(cmdline), __entry->pid, __entry->i_size, - (unsigned long) __entry->ino) -); - -DECLARE_EVENT_CLASS(android_fs_data_end_template, - TP_PROTO(struct inode *inode, loff_t offset, int bytes), - TP_ARGS(inode, offset, bytes), - TP_STRUCT__entry( - __field(ino_t, ino); - __field(loff_t, offset); - __field(int, bytes); - ), - TP_fast_assign( - { - __entry->ino = inode->i_ino; - __entry->offset = offset; - __entry->bytes = bytes; - } - ), - TP_printk("ino %lu, offset %llu, bytes %d", - (unsigned long) __entry->ino, - __entry->offset, __entry->bytes) -); - -DECLARE_EVENT_CLASS(android_fs_fsync_start_template, - TP_PROTO(struct inode *inode, - pid_t pid, char *pathname, char *command), - TP_ARGS(inode, pid, pathname, command), - TP_STRUCT__entry( - __string(pathbuf, pathname); - __field(loff_t, i_size); - __string(cmdline, command); - __field(pid_t, pid); - __field(ino_t, ino); - ), - TP_fast_assign( - { - /* - * Replace the spaces in filenames and cmdlines - * because this screws up the tooling that parses - * the traces. 
- */ - __assign_str(pathbuf, pathname); - (void)strreplace(__get_str(pathbuf), ' ', '_'); - __entry->i_size = i_size_read(inode); - __assign_str(cmdline, command); - (void)strreplace(__get_str(cmdline), ' ', '_'); - __entry->pid = pid; - __entry->ino = inode->i_ino; - } - ), - TP_printk("entry_name %s, cmdline %s," - " pid %d, i_size %llu, ino %lu", - __get_str(pathbuf), - __get_str(cmdline), __entry->pid, __entry->i_size, - (unsigned long) __entry->ino) -); - -#endif /* _TRACE_ANDROID_FS_TEMPLATE_H */ From ad2c7aeb92b3d0f6fcfcbbcb684909481957d42e Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Tue, 1 Dec 2020 10:24:26 -0800 Subject: [PATCH 565/580] usb: gadget: f_ncm: Revert to upstream Revert the following changes to restore to upstream version: commit d50dafdc0321 ("usb: gadget: f_ncm: allocate/free net device upon driver bind/unbind") commit 2b958bac040d ("usb: gadget: Add check gadget function bind or not") Reason for revert: Causes gether_set_ifname() support to kernel panic. Change-Id: I51064467cad63e47a4a9734f18f1a5b95fa8db86 --- drivers/usb/gadget/function/f_ncm.c | 55 ++++++++----------- .../usb/gadget/function/u_ether_configfs.h | 35 ------------ 2 files changed, 23 insertions(+), 67 deletions(-) diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index d9d0cebc37cf..495d31922858 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1470,38 +1470,18 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f) */ if (!ncm_opts->bound) { mutex_lock(&ncm_opts->lock); - ncm_opts->net = gether_setup_default(); - if (IS_ERR(ncm_opts->net)) { - status = PTR_ERR(ncm_opts->net); - mutex_unlock(&ncm_opts->lock); - goto error; - } gether_set_gadget(ncm_opts->net, cdev->gadget); status = gether_register_netdev(ncm_opts->net); mutex_unlock(&ncm_opts->lock); - if (status) { - free_netdev(ncm_opts->net); - goto error; - } + if (status) + goto fail; ncm_opts->bound = true; } - - /* export host's Ethernet address in CDC format */ - status = gether_get_host_addr_cdc(ncm_opts->net, ncm->ethaddr, - sizeof(ncm->ethaddr)); - if (status < 12) { /* strlen("01234567890a") */ - ERROR(cdev, "%s: failed to get host eth addr, err %d\n", - __func__, status); - status = -EINVAL; - goto netdev_cleanup; - } - ncm->port.ioport = netdev_priv(ncm_opts->net); - us = usb_gstrings_attach(cdev, ncm_strings, ARRAY_SIZE(ncm_string_defs)); if (IS_ERR(us)) { status = PTR_ERR(us); - goto netdev_cleanup; + goto fail; } ncm_control_intf.iInterface = us[STRING_CTRL_IDX].id; ncm_data_nop_intf.iInterface = us[STRING_DATA_IDX].id; @@ -1609,10 +1589,7 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f) kfree(ncm->notify_req->buf); usb_ep_free_request(ncm->notify, ncm->notify_req); } -netdev_cleanup: - gether_cleanup(netdev_priv(ncm_opts->net)); -error: ERROR(cdev, "%s: can't bind, err %d\n", f->name, status); return status; @@ -1708,6 +1685,8 @@ static void ncm_free_inst(struct usb_function_instance *f) opts = container_of(f, struct f_ncm_opts, func_inst); if (opts->bound) gether_cleanup(netdev_priv(opts->net)); + else + free_netdev(opts->net); kfree(opts->ncm_interf_group); kfree(opts); } @@ -1726,6 +1705,12 @@ static struct usb_function_instance *ncm_alloc_inst(void) mutex_init(&opts->lock); opts->func_inst.free_func_inst = ncm_free_inst; + opts->net = gether_setup_default(); + if (IS_ERR(opts->net)) { + struct net_device *net = opts->net; + kfree(opts); + return ERR_CAST(net); + } 
INIT_LIST_HEAD(&opts->ncm_os_desc.ext_prop); descs[0] = &opts->ncm_os_desc; @@ -1768,13 +1753,9 @@ static void ncm_free(struct usb_function *f) static void ncm_unbind(struct usb_configuration *c, struct usb_function *f) { struct f_ncm *ncm = func_to_ncm(f); - struct f_ncm_opts *opts = container_of(f->fi, struct f_ncm_opts, - func_inst); DBG(c->cdev, "ncm unbind\n"); - opts->bound = false; - hrtimer_cancel(&ncm->task_timer); kfree(f->os_desc_table); @@ -1790,14 +1771,13 @@ static void ncm_unbind(struct usb_configuration *c, struct usb_function *f) kfree(ncm->notify_req->buf); usb_ep_free_request(ncm->notify, ncm->notify_req); - - gether_cleanup(netdev_priv(opts->net)); } static struct usb_function *ncm_alloc(struct usb_function_instance *fi) { struct f_ncm *ncm; struct f_ncm_opts *opts; + int status; /* allocate and initialize one new instance */ ncm = kzalloc(sizeof(*ncm), GFP_KERNEL); @@ -1807,9 +1787,20 @@ static struct usb_function *ncm_alloc(struct usb_function_instance *fi) opts = container_of(fi, struct f_ncm_opts, func_inst); mutex_lock(&opts->lock); opts->refcnt++; + + /* export host's Ethernet address in CDC format */ + status = gether_get_host_addr_cdc(opts->net, ncm->ethaddr, + sizeof(ncm->ethaddr)); + if (status < 12) { /* strlen("01234567890a") */ + kfree(ncm); + mutex_unlock(&opts->lock); + return ERR_PTR(-EINVAL); + } ncm_string_defs[STRING_MAC_IDX].s = ncm->ethaddr; + spin_lock_init(&ncm->lock); ncm_reset_values(ncm); + ncm->port.ioport = netdev_priv(opts->net); mutex_unlock(&opts->lock); ncm->port.is_fixed = true; ncm->port.supports_multi_frame = true; diff --git a/drivers/usb/gadget/function/u_ether_configfs.h b/drivers/usb/gadget/function/u_ether_configfs.h index b701a63f771d..e70425a5dcb4 100644 --- a/drivers/usb/gadget/function/u_ether_configfs.h +++ b/drivers/usb/gadget/function/u_ether_configfs.h @@ -32,11 +32,6 @@ struct f_##_f_##_opts *opts = to_f_##_f_##_opts(item); \ int result; \ \ - if (opts->bound == false) { \ - pr_err("Gadget function do not bind yet.\n"); \ - return -ENODEV; \ - } \ - \ mutex_lock(&opts->lock); \ result = gether_get_dev_addr(opts->net, page, PAGE_SIZE); \ mutex_unlock(&opts->lock); \ @@ -50,11 +45,6 @@ struct f_##_f_##_opts *opts = to_f_##_f_##_opts(item); \ int ret; \ \ - if (opts->bound == false) { \ - pr_err("Gadget function do not bind yet.\n"); \ - return -ENODEV; \ - } \ - \ mutex_lock(&opts->lock); \ if (opts->refcnt) { \ mutex_unlock(&opts->lock); \ @@ -77,11 +67,6 @@ struct f_##_f_##_opts *opts = to_f_##_f_##_opts(item); \ int result; \ \ - if (opts->bound == false) { \ - pr_err("Gadget function do not bind yet.\n"); \ - return -ENODEV; \ - } \ - \ mutex_lock(&opts->lock); \ result = gether_get_host_addr(opts->net, page, PAGE_SIZE); \ mutex_unlock(&opts->lock); \ @@ -95,11 +80,6 @@ struct f_##_f_##_opts *opts = to_f_##_f_##_opts(item); \ int ret; \ \ - if (opts->bound == false) { \ - pr_err("Gadget function do not bind yet.\n"); \ - return -ENODEV; \ - } \ - \ mutex_lock(&opts->lock); \ if (opts->refcnt) { \ mutex_unlock(&opts->lock); \ @@ -122,11 +102,6 @@ struct f_##_f_##_opts *opts = to_f_##_f_##_opts(item); \ unsigned qmult; \ \ - if (opts->bound == false) { \ - pr_err("Gadget function do not bind yet.\n"); \ - return -ENODEV; \ - } \ - \ mutex_lock(&opts->lock); \ qmult = gether_get_qmult(opts->net); \ mutex_unlock(&opts->lock); \ @@ -140,11 +115,6 @@ u8 val; \ int ret; \ \ - if (opts->bound == false) { \ - pr_err("Gadget function do not bind yet.\n"); \ - return -ENODEV; \ - } \ - \ mutex_lock(&opts->lock); \ if 
(opts->refcnt) { \ ret = -EBUSY; \ @@ -171,11 +141,6 @@ out: \ struct f_##_f_##_opts *opts = to_f_##_f_##_opts(item); \ int ret; \ \ - if (opts->bound == false) { \ - pr_err("Gadget function do not bind yet.\n"); \ - return -ENODEV; \ - } \ - \ mutex_lock(&opts->lock); \ ret = gether_get_ifname(opts->net, page, PAGE_SIZE); \ mutex_unlock(&opts->lock); \ From 1e664bd7b6d9444f42304908a372654620e778f4 Mon Sep 17 00:00:00 2001 From: Hardik Gajjar Date: Fri, 20 Oct 2023 17:33:24 +0200 Subject: [PATCH 566/580] usb: gadget: f_ncm: Always set current gadget in ncm_bind() [ Upstream commit a04224da1f3424b2c607b12a3bd1f0e302fb8231 ] Previously, gadget assignment to the net device occurred exclusively during the initial binding attempt. Nevertheless, the gadget pointer could change during bind/unbind cycles due to various conditions, including the unloading/loading of the UDC device driver or the detachment/reconnection of an OTG-capable USB hub device. This patch relocates the gether_set_gadget() function out from ncm_opts->bound condition check, ensuring that the correct gadget is assigned during each bind request. The provided logs demonstrate the consistency of ncm_opts throughout the power cycle, while the gadget may change. * OTG hub connected during boot up and assignment of gadget and ncm_opts pointer [ 2.366301] usb 2-1.5: New USB device found, idVendor=2996, idProduct=0105 [ 2.366304] usb 2-1.5: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 2.366306] usb 2-1.5: Product: H2H Bridge [ 2.366308] usb 2-1.5: Manufacturer: Aptiv [ 2.366309] usb 2-1.5: SerialNumber: 13FEB2021 [ 2.427989] usb 2-1.5: New USB device found, VID=2996, PID=0105 [ 2.428959] dabridge 2-1.5:1.0: dabridge 2-4 total endpoints=5, 0000000093a8d681 [ 2.429710] dabridge 2-1.5:1.0: P(0105) D(22.06.22) F(17.3.16) H(1.1) high-speed [ 2.429714] dabridge 2-1.5:1.0: Hub 2-2 P(0151) V(06.87) [ 2.429956] dabridge 2-1.5:1.0: All downstream ports in host mode [ 2.430093] gadget 000000003c414d59 ------> gadget pointer * NCM opts and associated gadget pointer during First ncm_bind [ 34.763929] NCM opts 00000000aa304ac9 [ 34.763930] NCM gadget 000000003c414d59 * OTG capable hub disconnecte or assume driver unload. [ 97.203114] usb 2-1: USB disconnect, device number 2 [ 97.203118] usb 2-1.1: USB disconnect, device number 3 [ 97.209217] usb 2-1.5: USB disconnect, device number 4 [ 97.230990] dabr_udc deleted * Reconnect the OTG hub or load driver assaign new gadget pointer. 
[ 111.534035] usb 2-1.1: New USB device found, idVendor=2996, idProduct=0120, bcdDevice= 6.87 [ 111.534038] usb 2-1.1: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 111.534040] usb 2-1.1: Product: Vendor [ 111.534041] usb 2-1.1: Manufacturer: Aptiv [ 111.534042] usb 2-1.1: SerialNumber: Superior [ 111.535175] usb 2-1.1: New USB device found, VID=2996, PID=0120 [ 111.610995] usb 2-1.5: new high-speed USB device number 8 using xhci-hcd [ 111.630052] usb 2-1.5: New USB device found, idVendor=2996, idProduct=0105, bcdDevice=21.02 [ 111.630055] usb 2-1.5: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 111.630057] usb 2-1.5: Product: H2H Bridge [ 111.630058] usb 2-1.5: Manufacturer: Aptiv [ 111.630059] usb 2-1.5: SerialNumber: 13FEB2021 [ 111.687464] usb 2-1.5: New USB device found, VID=2996, PID=0105 [ 111.690375] dabridge 2-1.5:1.0: dabridge 2-8 total endpoints=5, 000000000d87c961 [ 111.691172] dabridge 2-1.5:1.0: P(0105) D(22.06.22) F(17.3.16) H(1.1) high-speed [ 111.691176] dabridge 2-1.5:1.0: Hub 2-6 P(0151) V(06.87) [ 111.691646] dabridge 2-1.5:1.0: All downstream ports in host mode [ 111.692298] gadget 00000000dc72f7a9 --------> new gadget ptr on connect * NCM opts and associated gadget pointer during second ncm_bind [ 113.271786] NCM opts 00000000aa304ac9 -----> same opts ptr used during first bind [ 113.271788] NCM gadget 00000000dc72f7a9 ----> however new gaget ptr, that will not set in net_device due to ncm_opts->bound = true Change-Id: I803f892ccd2a0f9558d4f32d8a3104aba78353ff Signed-off-by: Hardik Gajjar Link: https://lore.kernel.org/r/20231020153324.82794-1-hgajjar@de.adit-jv.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/usb/gadget/function/f_ncm.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index 495d31922858..44bddfc75bb2 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1461,22 +1461,17 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f) f->os_desc_table[0].os_desc = &ncm_opts->ncm_os_desc; } - /* - * in drivers/usb/gadget/configfs.c:configfs_composite_bind() - * configurations are bound in sequence with list_for_each_entry, - * in each configuration its functions are bound in sequence - * with list_for_each_entry, so we assume no race condition - * with regard to ncm_opts->bound access - */ - if (!ncm_opts->bound) { - mutex_lock(&ncm_opts->lock); - gether_set_gadget(ncm_opts->net, cdev->gadget); + mutex_lock(&ncm_opts->lock); + gether_set_gadget(ncm_opts->net, cdev->gadget); + if (!ncm_opts->bound) status = gether_register_netdev(ncm_opts->net); - mutex_unlock(&ncm_opts->lock); - if (status) - goto fail; - ncm_opts->bound = true; - } + mutex_unlock(&ncm_opts->lock); + + if (status) + goto fail; + + ncm_opts->bound = true; + us = usb_gstrings_attach(cdev, ncm_strings, ARRAY_SIZE(ncm_string_defs)); if (IS_ERR(us)) { From 5107711a723479717ae721c905c6d6ff5b51659c Mon Sep 17 00:00:00 2001 From: Daniel M German Date: Wed, 19 Jun 2019 21:50:38 -0700 Subject: [PATCH 567/580] usb: Replace snprintf with scnprintf in gether_get_ifname [ Upstream commit 37e444c8296c14cb5768a1197b24cfc07ee8e0cd ] snprintf returns the actual length of the buffer created; however, this is not the case if snprintf truncates its parameter. See https://lwn.net/Articles/69419/ for a detailed explanation. 
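For illustration, a minimal user-space sketch of the difference in return values described above; the buffer size, the test string, and the my_scnprintf() helper are invented for the example and only mimic the kernel's scnprintf() semantics:

#include <stdio.h>

/* Hypothetical stand-in for the kernel's scnprintf(): report only what
 * was actually stored in the buffer, excluding the trailing NUL. */
static int my_scnprintf(char *buf, size_t size, const char *src)
{
        int would_be = snprintf(buf, size, "%s", src);

        if (would_be < 0 || size == 0)
                return 0;
        return (size_t)would_be >= size ? (int)(size - 1) : would_be;
}

int main(void)
{
        char buf[4]; /* too small for "usb0\n" */

        /* snprintf() reports the length it would have written: 5 */
        printf("snprintf : %d\n", snprintf(buf, sizeof(buf), "%s", "usb0\n"));
        /* a scnprintf()-style helper reports what was stored: 3 */
        printf("scnprintf: %d\n", my_scnprintf(buf, sizeof(buf), "usb0\n"));
        return 0;
}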
The current code correctly handles this case at the expense of extra code in the return statement. scnprintf does return the actual length of the buffer created, making the ?: operator unnecessary in the return statement. This change does not alter the functionality of the code. Change-Id: I518abb00c6bd19d60d95a0ba8577a174feaaa3f3 Signed-off-by: Daniel M German Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/u_ether.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 3d71ed861a84..782737c95d9e 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -984,9 +984,9 @@ int gether_get_ifname(struct net_device *net, char *name, int len) int ret; rtnl_lock(); - ret = snprintf(name, len, "%s\n", netdev_name(net)); + ret = scnprintf(name, len, "%s\n", netdev_name(net)); rtnl_unlock(); - return ret < len ? ret : len; + return ret; } EXPORT_SYMBOL_GPL(gether_get_ifname); From cc216f0ab158820f935483095f5e350e8ea291da Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 14 Jan 2021 08:42:22 +0900 Subject: [PATCH 568/580] FROMGIT: usb: gadget: u_ether: support configuring interface names. This patch allows the administrator to configure the interface name of a function using u_ether (e.g., eem, ncm, rndis). Currently, all such interfaces, regardless of function type, are always called usb0, usb1, etc. This makes it very cumbersome to use more than one such type at a time, because userspace cannot easily tell the interfaces apart and apply the right configuration to each one. Interface renaming in userspace based on driver doesn't help, because the interfaces all have the same driver. Without this patch, doing this requires hacks/workarounds such as setting fixed MAC addresses on the functions, and then renaming by MAC address, or scraping configfs after each interface is created to find out what it is. Setting the interface name is done by writing to the same "ifname" configfs attribute that reports the interface name after the function is bound. The write must contain an interface pattern such as "usb%d" (which will cause the net core to pick the next available interface name starting with "usb"). This patch does not allow writing an exact interface name (as opposed to a pattern) because if the interface already exists at bind time, the bind will fail and the whole gadget will fail to activate. This could be allowed in a future patch. For compatibility with current userspace, when reading an ifname that has not currently been set, the result is still "(unnamed net_device)". Once a write to ifname happens, then reading ifname will return whatever was last written. Tested by configuring an rndis function and an ncm function on the same gadget, and writing "rndis%d" to ifname on the rndis function and "ncm%d" to ifname on the ncm function. When the gadget was bound, the rndis interface was rndis0 and the ncm interface was ncm0.
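For illustration, a small stand-alone sketch of the "exactly one %d" pattern rule described above. The helper valid_ifname_pattern() is hypothetical; the actual check sits in the gether_set_ifname() hunk further below.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Same rule as the in-kernel check: one '%', and it must be "%d". */
static bool valid_ifname_pattern(const char *name)
{
        const char *p = strchr(name, '%');

        return p && p[1] == 'd' && !strchr(p + 2, '%');
}

int main(void)
{
        printf("%d %d %d\n",
               valid_ifname_pattern("ncm%d"),  /* 1: accepted */
               valid_ifname_pattern("ncm0"),   /* 0: exact names are rejected */
               valid_ifname_pattern("u%d%d")); /* 0: more than one conversion */
        return 0;
}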
Signed-off-by: Lorenzo Colitti (cherry picked from commit 63d152149b2d0860ccf8c4e6596b6175b2b7ace6 https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb-next) Link: https://lore.kernel.org/r/20210113234222.3272933-1-lorenzo@google.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Lorenzo Colitti Change-Id: I04deb6cc1d8a5b8ee82404940de2a79c06fbafe7 Signed-off-by: Greg Kroah-Hartman --- Documentation/usb/gadget-testing.txt | 30 ++++++++--------- drivers/usb/gadget/function/u_ether.c | 33 ++++++++++++++++++- drivers/usb/gadget/function/u_ether.h | 12 +++++++ .../usb/gadget/function/u_ether_configfs.h | 15 ++++++++- 4 files changed, 73 insertions(+), 17 deletions(-) diff --git a/Documentation/usb/gadget-testing.txt b/Documentation/usb/gadget-testing.txt index 5908a21fddb6..2e309499e2bb 100644 --- a/Documentation/usb/gadget-testing.txt +++ b/Documentation/usb/gadget-testing.txt @@ -73,9 +73,9 @@ The ECM function provides these attributes in its function directory: and after creating the functions/ecm. they contain default values: qmult is 5, dev_addr and host_addr are randomly selected. -Except for ifname they can be written to until the function is linked to a -configuration. The ifname is read-only and contains the name of the interface -which was assigned by the net core, e. g. usb0. +The ifname can be written to if the function is not bound. A write must be an +interface pattern such as "usb%d", which will cause the net core to choose the +next free usbX interface. By default, it is set to "usb%d". Testing the ECM function ------------------------ @@ -106,9 +106,9 @@ The ECM subset function provides these attributes in its function directory: and after creating the functions/ecm. they contain default values: qmult is 5, dev_addr and host_addr are randomly selected. -Except for ifname they can be written to until the function is linked to a -configuration. The ifname is read-only and contains the name of the interface -which was assigned by the net core, e. g. usb0. +The ifname can be written to if the function is not bound. A write must be an +interface pattern such as "usb%d", which will cause the net core to choose the +next free usbX interface. By default, it is set to "usb%d". Testing the ECM subset function ------------------------------- @@ -139,9 +139,9 @@ The EEM function provides these attributes in its function directory: and after creating the functions/eem. they contain default values: qmult is 5, dev_addr and host_addr are randomly selected. -Except for ifname they can be written to until the function is linked to a -configuration. The ifname is read-only and contains the name of the interface -which was assigned by the net core, e. g. usb0. +The ifname can be written to if the function is not bound. A write must be an +interface pattern such as "usb%d", which will cause the net core to choose the +next free usbX interface. By default, it is set to "usb%d". Testing the EEM function ------------------------ @@ -397,9 +397,9 @@ The NCM function provides these attributes in its function directory: and after creating the functions/ncm. they contain default values: qmult is 5, dev_addr and host_addr are randomly selected. -Except for ifname they can be written to until the function is linked to a -configuration. The ifname is read-only and contains the name of the interface -which was assigned by the net core, e. g. usb0. +The ifname can be written to if the function is not bound. 
A write must be an +interface pattern such as "usb%d", which will cause the net core to choose the +next free usbX interface. By default, it is set to "usb%d". Testing the NCM function ------------------------ @@ -521,9 +521,9 @@ The RNDIS function provides these attributes in its function directory: and after creating the functions/rndis. they contain default values: qmult is 5, dev_addr and host_addr are randomly selected. -Except for ifname they can be written to until the function is linked to a -configuration. The ifname is read-only and contains the name of the interface -which was assigned by the net core, e. g. usb0. +The ifname can be written to if the function is not bound. A write must be an +interface pattern such as "usb%d", which will cause the net core to choose the +next free usbX interface. By default, it is set to "usb%d". Testing the RNDIS function -------------------------- diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 782737c95d9e..d296d8f92808 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -83,6 +83,7 @@ struct eth_dev { bool zlp; bool no_skb_reserve; + bool ifname_set; u8 host_mac[ETH_ALEN]; u8 dev_mac[ETH_ALEN]; }; @@ -981,15 +982,45 @@ EXPORT_SYMBOL_GPL(gether_get_qmult); int gether_get_ifname(struct net_device *net, char *name, int len) { + struct eth_dev *dev = netdev_priv(net); int ret; rtnl_lock(); - ret = scnprintf(name, len, "%s\n", netdev_name(net)); + ret = scnprintf(name, len, "%s\n", + dev->ifname_set ? net->name : netdev_name(net)); rtnl_unlock(); return ret; } EXPORT_SYMBOL_GPL(gether_get_ifname); +int gether_set_ifname(struct net_device *net, const char *name, int len) +{ + struct eth_dev *dev = netdev_priv(net); + char tmp[IFNAMSIZ]; + const char *p; + + if (name[len - 1] == '\n') + len--; + + if (len >= sizeof(tmp)) + return -E2BIG; + + strscpy(tmp, name, len + 1); + if (!dev_valid_name(tmp)) + return -EINVAL; + + /* Require exactly one %d, so binding will not fail with EEXIST. */ + p = strchr(name, '%'); + if (!p || p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + + strncpy(net->name, tmp, sizeof(net->name)); + dev->ifname_set = true; + + return 0; +} +EXPORT_SYMBOL_GPL(gether_set_ifname); + unsigned int gether_get_ul_max_pkts_per_xfer(struct net_device *net) { struct eth_dev *dev; diff --git a/drivers/usb/gadget/function/u_ether.h b/drivers/usb/gadget/function/u_ether.h index 35810229ca48..d12becb97e43 100644 --- a/drivers/usb/gadget/function/u_ether.h +++ b/drivers/usb/gadget/function/u_ether.h @@ -248,6 +248,18 @@ unsigned gether_get_qmult(struct net_device *net); */ int gether_get_ifname(struct net_device *net, char *name, int len); +/** + * gether_set_ifname - set an ethernet-over-usb link interface name + * @net: device representing this link + * @name: new interface name + * @len: length of @name + * + * This sets the interface name of this ethernet-over-usb link. + * A single terminating newline, if any, is ignored. + * Returns zero on success, else negative errno. 
+ */ +int gether_set_ifname(struct net_device *net, const char *name, int len); + /** * gether_get_ul_max_pkts_per_xfer - get max pks/xfer for UL aggrregarion * @net: device representing this link diff --git a/drivers/usb/gadget/function/u_ether_configfs.h b/drivers/usb/gadget/function/u_ether_configfs.h index e70425a5dcb4..6da88d254887 100644 --- a/drivers/usb/gadget/function/u_ether_configfs.h +++ b/drivers/usb/gadget/function/u_ether_configfs.h @@ -148,7 +148,20 @@ out: \ return ret; \ } \ \ - CONFIGFS_ATTR_RO(_f_##_opts_, ifname) + static ssize_t _f_##_opts_ifname_store(struct config_item *item, \ + const char *page, size_t len)\ + { \ + struct f_##_f_##_opts *opts = to_f_##_f_##_opts(item); \ + int ret = -EBUSY; \ + \ + mutex_lock(&opts->lock); \ + if (!opts->refcnt) \ + ret = gether_set_ifname(opts->net, page, len); \ + mutex_unlock(&opts->lock); \ + return ret ?: len; \ + } \ + \ + CONFIGFS_ATTR(_f_##_opts_, ifname) #define USB_ETHER_CONFIGFS_ITEM_ATTR_U8_RW(_f_, _n_) \ static ssize_t _f_##_opts_##_n_##_show(struct config_item *item,\ From 11825792784e0c76e01b855279993839c6ac8843 Mon Sep 17 00:00:00 2001 From: Hridaya Prajapati Date: Mon, 28 Apr 2025 11:07:15 +0545 Subject: [PATCH 569/580] Revert "ANDROID: fs: fuse: Freeze client on suspend when request sent to userspace" commit e61b11a7370f7da5a0dc6a17d9d3941073c51303 ("ANDROID: fuse: Allocate zeroed memory for canonical path") fixes the deadlock that this patch tries to fix. This reverts commit aeb761e164439419e10b8006322268f73cd738e7. Change-Id: I7137a93cf83b4d2a3cecb2b2de6f9fa14e6a2702 --- fs/fuse/dev.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5ecaa8cc02dc..8a772b792e25 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -22,7 +22,6 @@ #include #include #include -#include MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -488,9 +487,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) * Either request is already in userspace, or it was forced. * Wait it out. */ - while (!test_bit(FR_FINISHED, &req->flags)) - wait_event_freezable(req->waitq, - test_bit(FR_FINISHED, &req->flags)); + wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) From 7ffa90dc0e9a2726867c55185596d0dfee5e9ced Mon Sep 17 00:00:00 2001 From: stic-server-open <1138705738@qq.com> Date: Wed, 30 Apr 2025 22:25:31 +0800 Subject: [PATCH 570/580] Revert "f2fs: avoid build warnining in extent_cache" * Reapply new patch from upstream kernel. This reverts commit 908a46704da7392e0f2d93dfd47e52c967a6672d. 
Change-Id: I2030f66360dcf5b67d067339945ca2ea5ffbd3f6 Signed-off-by: zhaoyuenan --- fs/f2fs/extent_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index ccdc4d0c70c8..111b754d7950 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -673,7 +673,7 @@ static void __update_extent_tree_range(struct inode *inode, struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; - struct extent_info ei, dei, prev = {}; + struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; From ef24b96cf60a0b2b832862369b62176ed6f27b22 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Dec 2022 14:05:44 -0800 Subject: [PATCH 571/580] UPSTREAM: f2fs: initialize extent_cache parameter This can avoid confusing tracepoint values. Signed-off-by: Jaegeuk Kim (cherry picked from commit fe59109ae5c0b34a8c7c07f693fc501b12b57787) Change-Id: I64d127fed283cfe93579a031b1b8762d35069f98 Signed-off-by: zhaoyuenan --- fs/f2fs/data.c | 2 +- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2f12ca2d43ed..afd0dea1ab2d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2243,7 +2243,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct extent_info ei = {0, }; + struct extent_info ei = {}; bool from_dnode = true; int i; int ret = 0; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 111b754d7950..8c6bc4c61cee 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -939,7 +939,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei) static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { - struct extent_info ei; + struct extent_info ei = {}; if (!__may_extent_tree(dn->inode, type)) return; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index eda802839704..4aab3c0b68fc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2616,7 +2616,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, }; + struct extent_info ei = {}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b5d3ee534168..ac6f72a80d74 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3191,7 +3191,7 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_info ei; + struct extent_info ei = {}; if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { if (!ei.age) From bfef390e6c7b754d1a72191dd265fb7d51aff68b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Dec 2022 14:41:54 -0800 Subject: [PATCH 572/580] UPSTREAM: f2fs: don't mix to use union values in extent_info Let's explicitly use the defined values in block_age case only. 
Signed-off-by: Jaegeuk Kim (cherry picked from commit ed2724765e58e3310d3de48f4a1761631b3dd640) Change-Id: Ibbed9907e98c54a7fc807f5deb9300aeb2d35e70 Signed-off-by: zhaoyuenan --- fs/f2fs/extent_cache.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 8c6bc4c61cee..15cbd9db4bb2 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -882,7 +882,8 @@ static unsigned long long __calculate_block_age(unsigned long long new, } /* This returns a new age and allocated blocks in ei */ -static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +static int __get_new_block_age(struct inode *inode, struct extent_info *ei, + block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t f_size = i_size_read(inode); @@ -895,7 +896,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei) * block here. */ if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && - ei->blk == NEW_ADDR) + blkaddr == NEW_ADDR) return -EINVAL; if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { @@ -916,14 +917,14 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei) return 0; } - f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + f2fs_bug_on(sbi, blkaddr == NULL_ADDR); /* the data block was allocated for the first time */ - if (ei->blk == NEW_ADDR) + if (blkaddr == NEW_ADDR) goto out; - if (__is_valid_data_blkaddr(ei->blk) && - !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_bug_on(sbi, 1); return -EINVAL; } @@ -954,8 +955,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ else ei.blk = dn->data_blkaddr; } else if (type == EX_BLOCK_AGE) { - ei.blk = dn->data_blkaddr; - if (__get_new_block_age(dn->inode, &ei)) + if (__get_new_block_age(dn->inode, &ei, dn->data_blkaddr)) return; } __update_extent_tree_range(dn->inode, &ei, type); From 82789d675e85d5240280b3bad809a7adb1310d62 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Dec 2022 16:36:36 -0800 Subject: [PATCH 573/580] UPSTREAM: f2fs: should use a temp extent_info for lookup Otherwise, __lookup_extent_tree() will override the given extent_info which will be used by caller. 
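For illustration, a toy sketch of the aliasing problem being fixed; the struct and helper names are invented, but the shape matches the fix below, which copies the caller's extent_info into a temporary before calling __lookup_extent_tree():

/* Toy model: the lookup fills every field of its out-parameter. */
struct toy_info {
        unsigned int fofs, len;
        unsigned long long age, last_blocks;
};

static void toy_lookup(struct toy_info *out)
{
        out->fofs = 0;
        out->len = 0;
        out->age = 42;
        out->last_blocks = 100;
}

static void toy_get_new_age(struct toy_info *ei)
{
        struct toy_info tei = *ei;      /* only fofs and len are valid */

        toy_lookup(&tei);               /* scratch copy absorbs the overwrite */
        ei->age = tei.age;              /* caller's fofs/len stay intact */
        ei->last_blocks = tei.last_blocks;
}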
Signed-off-by: Jaegeuk Kim
(cherry picked from commit 22a341b43036415718f2d50f5f98b2f891fe17e9)
Change-Id: I5ddc08dd3d299f222b4b9b1334f2e675e37956b6
Signed-off-by: zhaoyuenan
---
 fs/f2fs/extent_cache.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 15cbd9db4bb2..9349cc7e57e1 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -889,6 +889,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei,
 	loff_t f_size = i_size_read(inode);
 	unsigned long long cur_blocks =
 				atomic64_read(&sbi->allocated_data_blocks);
+	struct extent_info tei = *ei;	/* only fofs and len are valid */
 
 	/*
 	 * When I/O is not aligned to a PAGE_SIZE, update will happen to the last
 	 * block here.
 	 */
 	if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) &&
 			blkaddr == NEW_ADDR)
 		return -EINVAL;
 
-	if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) {
+	if (__lookup_extent_tree(inode, ei->fofs, &tei, EX_BLOCK_AGE)) {
 		unsigned long long cur_age;
 
-		if (cur_blocks >= ei->last_blocks)
-			cur_age = cur_blocks - ei->last_blocks;
+		if (cur_blocks >= tei.last_blocks)
+			cur_age = cur_blocks - tei.last_blocks;
 		else
 			/* allocated_data_blocks overflow */
-			cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks;
+			cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks;
 
-		if (ei->age)
-			ei->age = __calculate_block_age(cur_age, ei->age);
+		if (tei.age)
+			ei->age = __calculate_block_age(cur_age, tei.age);
 		else
 			ei->age = cur_age;
 		ei->last_blocks = cur_blocks;

From 976739516dea264eadc2686972cbe92d0772b837 Mon Sep 17 00:00:00 2001
From: qixiaoyu1
Date: Thu, 2 Feb 2023 16:20:27 +0800
Subject: [PATCH 574/580] UPSTREAM: f2fs: fix wrong calculation of block age

Currently we wrongly calculate the new block age to
old * LAST_AGE_WEIGHT / 100.

Fix it to
new * (100 - LAST_AGE_WEIGHT) / 100 + old * LAST_AGE_WEIGHT / 100.

Signed-off-by: qixiaoyu1
Signed-off-by: xiongping1
Reviewed-by: Chao Yu
Signed-off-by: Jaegeuk Kim
(cherry picked from commit b03a41a495df35f8e8d25220878bd6b8472d9396)
Change-Id: I989877caf2517245750e7ed809b2af71f8943f96
Signed-off-by: zhaoyuenan
---
 fs/f2fs/extent_cache.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 9349cc7e57e1..db33158812aa 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -874,11 +874,18 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
 static unsigned long long __calculate_block_age(unsigned long long new,
 						unsigned long long old)
 {
-	unsigned long long diff;
+	unsigned int rem_old, rem_new;
+	unsigned long long res;
 
-	diff = (new >= old) ? new - (new - old) : new + (old - new);
+	res = div_u64_rem(new, 100, &rem_new) * (100 - LAST_AGE_WEIGHT) +
+		div_u64_rem(old, 100, &rem_old) * LAST_AGE_WEIGHT;
 
-	return div_u64(diff * LAST_AGE_WEIGHT, 100);
+	if (rem_new)
+		res += rem_new * (100 - LAST_AGE_WEIGHT) / 100;
+	if (rem_old)
+		res += rem_old * LAST_AGE_WEIGHT / 100;
+
+	return res;
 }
 
 /* This returns a new age and allocated blocks in ei */

From a23f404b6d08ea927565191e9cd029928e75d972 Mon Sep 17 00:00:00 2001
From: qixiaoyu1
Date: Wed, 30 Apr 2025 21:54:35 +0800
Subject: [PATCH 575/580] UPSTREAM: f2fs: add sysfs nodes to set last_age_weight

Signed-off-by: qixiaoyu1
Signed-off-by: xiongping1
Reviewed-by: Chao Yu
Signed-off-by: Jaegeuk Kim
(cherry picked from commit d23be468eada21c828058e0e8d60409eaec373ab)
Change-Id: I4c0a5137cbb6c70dc8ebfe6e1dc7377ae20898fa
Signed-off-by: zhaoyuenan
---
 Documentation/ABI/testing/sysfs-fs-f2fs |  5 +++++
 fs/f2fs/extent_cache.c                  | 15 +++++++++------
 fs/f2fs/f2fs.h                          |  1 +
 fs/f2fs/sysfs.c                         | 11 +++++++++++
 4 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index ae2ab41e07cb..0774d3ac3093 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -644,3 +644,8 @@ Contact:	"Ping Xiong"
 Description:	When DATA SEPARATION is on, it controls the age threshold to
 		indicate the data blocks as warm. By default it was initialized
 		as 2621440 blocks (equals to 10GB).
+
+What:		/sys/fs/f2fs/<disk>/last_age_weight
+Date:		January 2023
+Contact:	"Ping Xiong"
+Description:	When DATA SEPARATION is on, it controls the weight of last data block age.
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index db33158812aa..4cd8cc5bc0b2 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -871,19 +871,21 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
 }
 #endif
 
-static unsigned long long __calculate_block_age(unsigned long long new,
+static unsigned long long __calculate_block_age(struct f2fs_sb_info *sbi,
+						unsigned long long new,
 						unsigned long long old)
 {
 	unsigned int rem_old, rem_new;
 	unsigned long long res;
+	unsigned int weight = sbi->last_age_weight;
 
-	res = div_u64_rem(new, 100, &rem_new) * (100 - LAST_AGE_WEIGHT) +
-		div_u64_rem(old, 100, &rem_old) * LAST_AGE_WEIGHT;
+	res = div_u64_rem(new, 100, &rem_new) * (100 - weight) +
+		div_u64_rem(old, 100, &rem_old) * weight;
 
 	if (rem_new)
-		res += rem_new * (100 - LAST_AGE_WEIGHT) / 100;
+		res += rem_new * (100 - weight) / 100;
 	if (rem_old)
-		res += rem_old * LAST_AGE_WEIGHT / 100;
+		res += rem_old * weight / 100;
 
 	return res;
 }
@@ -917,7 +919,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei,
 			cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks;
 
 		if (tei.age)
-			ei->age = __calculate_block_age(cur_age, tei.age);
+			ei->age = __calculate_block_age(sbi, cur_age, tei.age);
 		else
 			ei->age = cur_age;
 		ei->last_blocks = cur_blocks;
@@ -1234,6 +1236,7 @@ void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi)
 	atomic64_set(&sbi->allocated_data_blocks, 0);
 	sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD;
 	sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD;
+	sbi->last_age_weight = LAST_AGE_WEIGHT;
 }
 
 int __init f2fs_create_extent_cache(void)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 2806d9b644a2..b9d52cb5e5d5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1683,6 +1683,7 @@ struct f2fs_sb_info {
 	/* The threshold used for hot and warm data seperation*/
 	unsigned int hot_data_age_threshold;
 	unsigned int warm_data_age_threshold;
+	unsigned int last_age_weight;
 
 	/* basic filesystem units */
 	unsigned int log_sectors_per_block;	/* log2 sectors per block */
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 2a1566ce4a0d..972a15a88fe3 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -687,6 +687,15 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
 		return count;
 	}
 
+	if (!strcmp(a->attr.name, "last_age_weight")) {
+		if (t > 100)
+			return -EINVAL;
+		if (t == *ui)
+			return count;
+		*ui = (unsigned int)t;
+		return count;
+	}
+
 	*ui = (unsigned int)t;
 
 	return count;
@@ -942,6 +951,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block)
 /* For block age extent cache */
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, last_age_weight, last_age_weight);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -1036,6 +1046,7 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(revoked_atomic_block),
 	ATTR_LIST(hot_data_age_threshold),
 	ATTR_LIST(warm_data_age_threshold),
+	ATTR_LIST(last_age_weight),
 	NULL,
 };

From 2d8b1b00be872bef69b7e567d2ea778c20a11a04 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Tue, 31 Jan 2023 22:47:00 +0800
Subject: [PATCH 576/580] UPSTREAM: f2fs: fix to update age extent correctly
 during truncation

nr_free may be less than len, we should update age extent cache w/ range
[fofs, len] rather than [fofs, nr_free].

Fixes: dfcfcc6d16b9 ("f2fs: add block_age-based extent cache")
Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim
(cherry picked from commit 8c0ed062ce27f6b7f0a568cb241e2b4dd2d9e6a6)
Change-Id: I0a4546f84fd75db2f13857e4922520ab5885e845
Signed-off-by: zhaoyuenan
---
 fs/f2fs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 4aab3c0b68fc..b0b894ffa67e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -644,7 +644,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 		fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page),
 							dn->inode) + ofs;
 		f2fs_update_read_extent_cache_range(dn, fofs, 0, len);
-		f2fs_update_age_extent_cache_range(dn, fofs, nr_free);
+		f2fs_update_age_extent_cache_range(dn, fofs, len);
 		dec_valid_block_count(sbi, dn->inode, nr_free);
 	}
 	dn->ofs_in_node = ofs;

From b3608a1d37fa90005f76550887230698ef33e6b4 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Tue, 31 Jan 2023 22:47:01 +0800
Subject: [PATCH 577/580] UPSTREAM: f2fs: fix to update age extent in
 f2fs_do_zero_range()

We should update age extent in f2fs_do_zero_range() like we did in
f2fs_truncate_data_blocks_range().
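
As with the previous fix, the point is that the age-extent cache has to
be invalidated over the full logical range that was touched, not just
over the blocks that happened to hold data. A toy userspace sketch of
that distinction, with made-up names and a fake one-entry cache:

	#include <stdio.h>

	#define NULL_ADDR 0u

	/* Toy cache: remembers the last range that was invalidated. */
	static unsigned int inval_fofs, inval_len;

	static void update_age_extent_cache_range(unsigned int fofs, unsigned int len)
	{
		inval_fofs = fofs;
		inval_len = len;
	}

	static void truncate_blocks_range(const unsigned int *blkaddr,
					  unsigned int fofs, unsigned int count)
	{
		unsigned int nr_free = 0;

		for (unsigned int i = 0; i < count; i++)
			if (blkaddr[i] != NULL_ADDR)	/* holes free nothing */
				nr_free++;

		/* Invalidate the whole range; nr_free may be smaller. */
		update_age_extent_cache_range(fofs, count);
		printf("freed %u/%u blocks, invalidated [%u, %u)\n",
		       nr_free, count, inval_fofs, inval_fofs + inval_len);
	}

	int main(void)
	{
		unsigned int blkaddr[] = { 10, NULL_ADDR, 12, NULL_ADDR, 14 };

		truncate_blocks_range(blkaddr, 100, 5);
		return 0;
	}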
Fixes: dfcfcc6d16b9 ("f2fs: add block_age-based extent cache")
Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim
(cherry picked from commit a84153f939808102dfa10904aa0f743e734a3e1d)
Change-Id: Iab59f94f792dbab8da7ab597d5a219476370a646
Signed-off-by: zhaoyuenan
---
 fs/f2fs/file.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b0b894ffa67e..507ee970dd75 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1481,6 +1481,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
 	}
 
 	f2fs_update_read_extent_cache_range(dn, start, 0, index - start);
+	f2fs_update_age_extent_cache_range(dn, start, index - start);
 
 	return ret;
 }

From 711041fe72866de8b41866708bf909c8b827513a Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Mon, 3 Mar 2025 22:16:21 +0000
Subject: [PATCH 578/580] UPSTREAM: f2fs: set highest IO priority for
 checkpoint thread

The checkpoint is the top priority thread which can stop all the filesystem
operations. Let's make it RT priority.

Reviewed-by: Daeho Jeong
Reviewed-by: Chao Yu
Signed-off-by: Jaegeuk Kim
(cherry picked from commit 8a2d9f00d502e6ef68c6d52f0863856040ddd2db)
Change-Id: I7ab430a7294fa9644512f0a32245646944446213
Signed-off-by: zhaoyuenan
---
 fs/f2fs/checkpoint.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ab8071f9ad49..905a5fe9ad4b 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -21,7 +21,7 @@
 #include "iostat.h"
 #include
 
-#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
 
 static struct kmem_cache *ino_entry_slab;
 struct kmem_cache *f2fs_inode_entry_slab;

From cb1b12dc85a827869ed2d54fa72e7008f3291510 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin
Date: Mon, 7 Aug 2023 20:03:57 -0700
Subject: [PATCH 579/580] sched/headers: Move 'struct sched_param' out of uapi,
 to work around glibc/musl breakage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both glibc and musl define 'struct sched_param' in sched.h, while kernel
has it in uapi/linux/sched/types.h, making it cumbersome to use
sched_getattr(2) or sched_setattr(2) from userspace.

For example, something like this:

	#include <sched.h>
	#include <linux/sched/types.h>

	struct sched_attr sa;

will result in "error: redefinition of ‘struct sched_param’" (note the
code doesn't need sched_param at all -- it needs struct sched_attr plus
some stuff from sched.h).

The situation is, glibc is not going to provide a wrapper for
sched_{get,set}attr, thus the need to include linux/sched_types.h
directly, which leads to the above problem.

Thus, the userspace is left with a few sub-par choices when it wants to
use e.g. sched_setattr(2), such as maintaining a copy of struct
sched_attr definition, or using some other ugly tricks.

OTOH, 'struct sched_param' is well known, defined in POSIX, and it won't
be ever changed (as that would break backward compatibility).

So, while 'struct sched_param' is indeed part of the kernel uapi,
exposing it the way it's done now creates an issue, and hiding it (like
this patch does) fixes that issue, hopefully without creating another
one: common userspace software rely on libc headers, and as for "special"
software (like libc), it looks like glibc and musl do not rely on kernel
headers for 'struct sched_param' definition (but let's Cc their mailing
lists in case it's otherwise).
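
For completeness, the kind of program this is meant to unbreak looks
roughly like the following. This is only a sketch, assuming kernel
headers that already carry this change; glibc still provides no wrapper,
so the raw syscall is used:

	#include <sched.h>		/* libc: struct sched_param, SCHED_* */
	#include <linux/sched/types.h>	/* kernel uapi: struct sched_attr */
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct sched_attr sa;

		memset(&sa, 0, sizeof(sa));
		sa.size = sizeof(sa);
		sa.sched_policy = SCHED_OTHER;
		sa.sched_nice = 5;

		/* No glibc wrapper exists, so call sched_setattr(2) directly. */
		if (syscall(SYS_sched_setattr, 0, &sa, 0))
			perror("sched_setattr");
		else
			printf("niceness set via sched_setattr()\n");
		return 0;
	}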
The alternative to this patch would be to move struct sched_attr to,
say, linux/sched.h, or linux/sched/attr.h (the new file).

Oh, and here is the previous attempt to fix the issue:
https://lore.kernel.org/all/20200528135552.GA87103@google.com/

While I support Linus arguments, the issue is still here
and needs to be fixed.

[ mingo: Linus is right, this shouldn't be needed - but on the other
  hand I agree that this header is not really helpful to user-space
  as-is. So let's pretend that is only about sched_attr, and call
  this commit a workaround for user-space breakage that it in reality
  is ... Also, remove the Fixes tag. ]

Change-Id: I3943f8f4a11a9007ccc392de3ece9e62841e8fcb
Signed-off-by: Kir Kolyshkin
Signed-off-by: Ingo Molnar
Cc: Linus Torvalds
Cc: Peter Zijlstra
Link: https://lore.kernel.org/r/20230808030357.1213829-1-kolyshkin@gmail.com
---
 include/linux/sched.h            | 5 ++++-
 include/uapi/linux/sched/types.h | 4 ----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 590e9d0a57c9..ce5e105adb5e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -51,7 +51,6 @@ struct reclaim_state;
 struct capture_control;
 struct robust_list_head;
 struct sched_attr;
-struct sched_param;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
@@ -366,6 +365,10 @@ enum uclamp_id {
 	UCLAMP_CNT
 };
 
+struct sched_param {
+	int sched_priority;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index c852153ddb0d..f863e747ff5f 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -4,10 +4,6 @@
 
 #include
 
-struct sched_param {
-	int sched_priority;
-};
-
 #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
 #define SCHED_ATTR_SIZE_VER1	56	/* add: util_{min,max} */

From 7d6808d57e8ab843c93af44a4b01cfacb46c350a Mon Sep 17 00:00:00 2001
From: Tashfin Shakeer Rhythm
Date: Sat, 9 Apr 2022 19:12:55 +0600
Subject: [PATCH 580/580] USB: composite: Re-allocate bytes for IN endpoint

Switching to kzalloc doesn't mean we can't allocate some extra bytes
for the TX buffer. This was simply an improper merge of commit [1] by
CodeLinaro. The commit was just supposed to replace kmalloc with
kzalloc.

Therefore, add back the buffer allocation.

[1]: ae9cf8b759c0 ("UPSTREAM: USB: gadget: zero allocate endpoint 0 buffers")

Fixes: ae9cf8b759c0 ("UPSTREAM: USB: gadget: zero allocate endpoint 0 buffers")
Change-Id: I97bcea8f9aed2d9a672b2f92b6cfd998f43b9d95
Signed-off-by: Tashfin Shakeer Rhythm
---
 drivers/usb/gadget/composite.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index e0b09ea718f5..dffbdd4add9f 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -2340,7 +2340,8 @@ int composite_dev_prepare(struct usb_composite_driver *composite,
 	if (!cdev->req)
 		return -ENOMEM;
 
-	cdev->req->buf = kzalloc(USB_COMP_EP0_BUFSIZ, GFP_KERNEL);
+	cdev->req->buf = kzalloc(USB_COMP_EP0_BUFSIZ +
+				(gadget->extra_buf_alloc), GFP_KERNEL);
 	if (!cdev->req->buf)
 		goto fail;