The ext2_get_group_desc function locates the group descriptor for the block group, block_group, that holds the inode identified by the given inode number ino. In the ext2 filesystem, the super block and the block group descriptors are stored in Block Group #0, and copies of them are kept in the other Block Groups.
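The mapping from inode number to group is pure arithmetic. Below is a minimal sketch of that lookup, condensed from the v5.15 ext2 sources; the helper name group_desc_for_ino is made up for illustration, and error handling (such as the block_group range check) is omitted.
/* Sketch: derive the block group from the inode number, then pull the
 * group descriptor out of the descriptor blocks cached at mount time. */
static struct ext2_group_desc *group_desc_for_ino(struct super_block *sb,
						  unsigned long ino)
{
	struct ext2_sb_info *sbi = EXT2_SB(sb);
	/* inode numbers start at 1, so inode 1 lives in group 0 */
	unsigned int block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
	/* several descriptors fit in one block: split into block + offset */
	unsigned long desc_block = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb);
	unsigned long offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1);
	struct ext2_group_desc *desc =
		(struct ext2_group_desc *)sbi->s_group_desc[desc_block]->b_data;
	return desc + offset;
}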
leava@ubuntu:/work/$ wget https://buildroot.uclibc.org/downloads/buildroot-2022.02.1.tar.gz
leava@ubuntu:/work/$ tar xf buildroot-2022.02.1.tar.gz
leava@ubuntu:/work/$ cd buildroot-2022.02.1
Generate the default config:
leava@ubuntu:/work/buildroot-2022.02.1/$ make qemu_aarch64_virt_defconfig
Adjust the config items that syzkaller needs:
Target options
Target Architecture - Aarch64 (little endian)
Toolchain
Toolchain type (External toolchain)
System Configuration
[*] Enable root login with password
(password) Root password
[*] Run a getty (login prompt) after boot
(ttyAMA0) TTY port
Target packages
[*] Show packages that are also provided by busybox
Networking applications
[*] dhcpcd
[*] iproute2
[*] openssh
Filesystem images
[*] ext2/3/4 root filesystem
ext2/3/4 variant (ext3)
(60M) exact size
[*] tar the root filesystem
Build with buildroot:
leava@ubuntu:/work/buildroot-2022.02.1/$ make
Check the build artifacts:
leava@ubuntu:/work/buildroot-2022.02.1/$ ls -l output/images
total 50668
-rw-r--r-- 1 blue root 10883584 May 3 23:41 Image
-rw-r--r-- 1 blue root 62914560 May 4 09:12 rootfs.ext2
lrwxrwxrwx 1 blue root 11 May 3 23:41 rootfs.ext3 -> rootfs.ext2
-rw-r--r-- 1 blue root 16640000 May 3 23:41 rootfs.tar
-rwxr-xr-x 1 blue root 486 May 4 09:12 start-qemu.sh
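The generated start-qemu.sh wraps the qemu invocation for this image. Spelled out by hand it looks roughly like the following; the flags are inferred from the qemu_aarch64_virt defaults, so double-check them against the generated script:
leava@ubuntu:/work/buildroot-2022.02.1/$ qemu-system-aarch64 -M virt -cpu cortex-a53 -nographic \
    -smp 1 -kernel output/images/Image \
    -append "rootwait root=/dev/vda console=ttyAMA0" \
    -drive file=output/images/rootfs.ext3,if=none,format=raw,id=hd0 \
    -device virtio-blk-device,drive=hd0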
leava@ubuntu:/work/$ wget https://mirrors.edge.kernel.org/pub/linux/kernel/v5.x/linux-5.15.tar.gz
leava@ubuntu:/work/$ tar xf linux-5.15.tar.gz
leava@ubuntu:/work/$ cd linux-5.15
Generate the default config:
leava@ubuntu:/work/linux-5.15/$ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make defconfig
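The defconfig alone does not include the coverage and sanitizer options syzkaller depends on. A hedged example of enabling the commonly required ones with scripts/config (see syzkaller's kernel_configs.md for the authoritative list):
leava@ubuntu:/work/linux-5.15/$ ./scripts/config -e KCOV -e KCOV_INSTRUMENT_ALL \
    -e DEBUG_INFO -e KASAN -e KASAN_INLINE
leava@ubuntu:/work/linux-5.15/$ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make olddefconfig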
leava@ubuntu:/work/syzkaller/$ export CC=/work/gcc-arm-10.3-2021.07-x86_64-aarch64-none-linux-gnu/bin/aarch64-linux-gnu-g++
leava@ubuntu:/work/syzkaller/$ make TARGETARCH=arm64
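If the build succeeds, the host-side binaries (such as syz-manager) land in bin/ and the target-side binaries (syz-fuzzer, syz-execprog, syz-executor in 2022-era trees) in bin/linux_arm64/; verify the layout locally:
leava@ubuntu:/work/syzkaller/$ ls bin bin/linux_arm64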
// 2168: int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
void *data)
{
int ret = 0;
int done = 0;
int error;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
int range_whole = 0;
xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* prev offset */
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
tag_pages_for_writeback(mapping, index, end);
tag = PAGECACHE_TAG_TOWRITE;
} else {
tag = PAGECACHE_TAG_DIRTY;
}
done_index = index;
while (!done && (index <= end)) {
int i;
nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
tag);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
done_index = page->index;
lock_page(page);
/*
 * Page truncated or invalidated. We can freely skip it
 * then, even for data integrity operations: the page
 * has disappeared concurrently, so there could be no
 * real expectation of this data integrity operation
 * even if there is now a new, dirty page at the same
 * pagecache address.
 */
if (unlikely(page->mapping != mapping)) {
continue_unlock:
unlock_page(page);
continue;
}
if (!PageDirty(page)) {
/* someone wrote it for us */
goto continue_unlock;
}
if (PageWriteback(page)) {
if (wbc->sync_mode != WB_SYNC_NONE)
wait_on_page_writeback(page);
else
goto continue_unlock;
}
BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
error = (*writepage)(page, wbc, data);
if (unlikely(error)) {
/*
 * Handle errors according to the type of
 * writeback. There's no need to continue for
 * background writeback. Just push done_index
 * past this page so media errors won't choke
 * writeout for the entire file. For integrity
 * writeback, we must process the entire dirty
 * set regardless of errors because the fs may
 * still have state to clear for each page. In
 * that case we continue processing and return
 * the first error.
 */
if (error == AOP_WRITEPAGE_ACTIVATE) {
unlock_page(page);
error = 0;
} else if (wbc->sync_mode != WB_SYNC_ALL) {
ret = error;
done_index = page->index + 1;
done = 1;
break;
}
if (!ret)
ret = error;
}
/*
 * We stop writing back only if we are not doing
 * integrity sync. In case of integrity sync we have to
 * keep going until we have written all the pages
 * we tagged for writeback prior to entering this loop.
 */
if (--wbc->nr_to_write <= 0 &&
wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
}
pagevec_release(&pvec);
cond_resched();
}
/*
 * If we hit the last page and there is more work to be done: wrap
 * back the index back to the start of the file for the next
 * time we are called.
 */
if (wbc->range_cyclic && !done)
done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);
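For context on how this function is reached: filesystems using the generic mpage path enter write_cache_pages through mpage_writepages, which passes __mpage_writepage (listed next) as the writepage callback. ext2's .writepages, for instance, is just a thin wrapper (fs/ext2/inode.c):
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	/* hand the per-page work to the generic mpage writeback path */
	return mpage_writepages(mapping, wbc, ext2_get_block);
}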
// 480: static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
void *data)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
struct address_space *mapping = page->mapping;
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
unsigned long end_index;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
sector_t last_block;
sector_t block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
unsigned first_unmapped = blocks_per_page;
struct block_device *bdev = NULL;
int boundary = 0;
sector_t boundary_block = 0;
struct block_device *boundary_bdev = NULL;
int length;
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
int op_flags = wbc_to_write_flags(wbc);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
/* If they're all mapped and dirty, do it */
page_block = 0;
do {
BUG_ON(buffer_locked(bh));
if (!buffer_mapped(bh)) {
/*
 * unmapped dirty buffers are created by
 * __set_page_dirty_buffers -> mmapped data
 */
if (buffer_dirty(bh))
goto confused;
if (first_unmapped == blocks_per_page)
first_unmapped = page_block;
continue;
}
if (first_unmapped != blocks_per_page)
goto confused; /* hole -> non-hole */
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
goto confused;
if (page_block) {
if (bh->b_blocknr != blocks[page_block-1] + 1)
goto confused;
}
blocks[page_block++] = bh->b_blocknr;
boundary = buffer_boundary(bh);
if (boundary) {
boundary_block = bh->b_blocknr;
boundary_bdev = bh->b_bdev;
}
bdev = bh->b_bdev;
} while ((bh = bh->b_this_page) != head);
if (first_unmapped)
goto page_is_mapped;
/*
 * Page has buffers, but they are all unmapped. The page was
 * created by pagein or read over a hole which was handled by
 * block_read_full_page(). If this address_space is also
 * using mpage_readahead then this can rarely happen.
 */
goto confused;
}
/*
 * The page has no buffers: map it to disk
 */
BUG_ON(!PageUptodate(page));
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = (i_size - 1) >> blkbits;
map_bh.b_page = page;
for (page_block = 0; page_block < blocks_per_page; ) {
map_bh.b_state = 0;
map_bh.b_size = 1 << blkbits;
if (mpd->get_block(inode, block_in_file, &map_bh, 1))
goto confused;
if (buffer_new(&map_bh))
clean_bdev_bh_alias(&map_bh);
if (buffer_boundary(&map_bh)) {
boundary_block = map_bh.b_blocknr;
boundary_bdev = map_bh.b_bdev;
}
if (page_block) {
if (map_bh.b_blocknr != blocks[page_block-1] + 1)
goto confused;
}
blocks[page_block++] = map_bh.b_blocknr;
boundary = buffer_boundary(&map_bh);
bdev = map_bh.b_bdev;
if (block_in_file == last_block)
break;
block_in_file++;
}
BUG_ON(page_block == 0);
first_unmapped = page_block;
page_is_mapped:
end_index = i_size >> PAGE_SHIFT;
if (page->index >= end_index) {
/*
 * The page straddles i_size. It must be zeroed out on each
 * and every writepage invocation because it may be mmapped.
 * "A file is mapped in multiples of the page size. For a file
 * that is not a multiple of the page size, the remaining memory
 * is zeroed when mapped, and writes to that region are not
 * written out to the file."
 */
unsigned offset = i_size & (PAGE_SIZE - 1);
if (page->index > end_index || !offset)
goto confused;
zero_user_segment(page, offset, PAGE_SIZE);
}
/*
 * This page will go to BIO. Do we need to send this BIO off first?
 */
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
alloc_new:
if (bio == NULL) {
if (first_unmapped == blocks_per_page) {
if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
page, wbc))
goto out;
}
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
}
/*
 * Must try to add the page before marking the buffer clean or
 * the confused fail path above (OOM) will be very confused when
 * it finds all bh marked clean (i.e. it will not write anything)
 */
wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
goto alloc_new;
}
clean_buffers(page, first_unmapped);
BUG_ON(PageWriteback(page));
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
}
} else {
mpd->last_block_in_bio = blocks[blocks_per_page - 1];
}
goto out;
confused:
if (bio)
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
} else {
ret = -EAGAIN;
goto out;
}
/*
 * The caller has a ref on the inode, so *mapping is stable
 */
mapping_set_error(mapping, ret);
out:
mpd->bio = bio;
return ret;
}
// 452: /* obsolete, don't use in new code */
static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
unsigned op_flags)
{
bio->bi_opf = op | op_flags;
}
In the case at hand, op_flags is set from the writeback_control by the wbc_to_write_flags function.
// 97: static inline int wbc_to_write_flags(struct writeback_control *wbc)
{
int flags = 0;
if (wbc->punt_to_cgroup)
flags = REQ_CGROUP_PUNT;
if (wbc->sync_mode == WB_SYNC_ALL)
flags |= REQ_SYNC;
else if (wbc->for_kupdate || wbc->for_background)
flags |= REQ_BACKGROUND;
return flags;
}
// 608: void guard_bio_eod(struct bio *bio)
{
sector_t maxsector;
struct hd_struct *part;
rcu_read_lock();
part = __disk_get_part(bio->bi_disk, bio->bi_partno);
if (part)
maxsector = part_nr_sects_read(part);
else
maxsector = get_capacity(bio->bi_disk);
rcu_read_unlock();
if (!maxsector)
return;
/*
 * If the *whole* IO is past the end of the device,
 * let it through, and the IO layer will turn it into
 * an EIO.
 */
if (unlikely(bio->bi_iter.bi_sector >= maxsector))
return;
maxsector -= bio->bi_iter.bi_sector;
if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
return;
bio_truncate(bio, maxsector << 9);
}
Usage: kvm-xfstests [<OPTIONS>] smoke|full
Usage: kvm-xfstests [<OPTIONS>] <test> ...
Usage: kvm-xfstests [<OPTIONS>] -g <group> ...
Usage: kvm-xfstests [<OPTIONS>] shell|maint
Usage: kvm-xfstests [<OPTIONS>] syz <repro>
Common options are:
-a - Disable auto-exclude; run all tests
-c config - Specify a file system configuration
-C count - Run the specified tests multiple times
-I image - Use this test appliance image
-m mountopts - Append mount options to fs config
-n nr_cpus - Specify the number of cpu's
-numa num - Ask KVM to create <num> NUMA nodes
-N - Enable networking (requires root)
-o opts - Extra kernel command line options
-O opts - Extra options for test runner
-r ram - Specify memory to be used in megabytes
-x group - Exclude group of tests from running
-X test - Exclude test from running
--kernel file - Boot the specified kernel
--initrd initrd - Boot with the specified initrd
--no-log - Don't save the log file for this run
--no-action - Print the command to start the VM
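For example, a single smoke run against an ext4 4k configuration is invoked like this (the config name follows the xfstests-bld documentation; adjust it to the filesystem under test):
leava@ubuntu:~/work/$ kvm-xfstests -c ext4/4k smoke
An interactive shell inside the test appliance is started the same way: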
leava@ubuntu:~/work/$ kvm-xfstests shell
Debian GNU/Linux 11 kvm-xfstests ttyS0
kvm-xfstests login: root (automatic login)
Linux kvm-xfstests 5.15.0-xfstests #456 SMP Wed Feb 23 19:12:12 JST 2022 x86_64
The programs included with the Debian GNU/Linux system are free software;
the exact distribution terms for each program are described in the
individual files in /usr/share/doc/*/copyright.
Debian GNU/Linux comes with ABSOLUTELY NO WARRANTY, to the extent
permitted by applicable law.
Last login: Thu Feb 24 23:56:29 JST 2022 on ttyS1
COLUMNS=238
LINES=58
root@kvm-xfstests:~#
diff --git a/kvm-xfstests/config.common b/kvm-xfstests/config.common
index 12cae0a..1de021f 100644
--- a/kvm-xfstests/config.common
+++ b/kvm-xfstests/config.common
@@ -4,4 +4,4 @@
 # Variables set here may be overridden in ~/.config/xfstests-common
 #
-PRIMARY_FSTYPE="ext4"
+PRIMARY_FSTYPE="xfs"
// 1116: void wb_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
struct bdi_writeback, dwork);
long pages_written;
set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
!test_bit(WB_registered, &wb->state))) {
/*
 * The normal path. Keep writing back @wb until its
 * work_list is empty. Note that this path is also taken
 * if @wb is shutting down even when we're running off the
 * rescuer as work_list needs to be drained.
 */
do {
pages_written = wb_do_writeback(wb);
trace_writeback_pages_written(pages_written);
} while (!list_empty(&wb->work_list));
} else {
/* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */
pages_written = writeback_inodes_wb(wb, 1024,
WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);
}
if (!list_empty(&wb->work_list))
wb_wakeup(wb);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
wb_wakeup_delayed(wb);
current->flags &= ~PF_SWAPWRITE;
}
// 1077: static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
{
if (!wb_has_dirty_io(wb))
return;
/*
 * All callers of this function want to start writeback of all
 * dirty pages. Places like vmscan can call this at a very
 * high frequency, causing pointless allocations of tons of
 * work items and keeping the flusher threads busy retrieving
 * that work. Ensure that we only allow one of them pending and
 * inflight at the time.
 */
if (test_bit(WB_start_all, &wb->state) ||
test_and_set_bit(WB_start_all, &wb->state))
return;
wb->start_all_reason = reason;
wb_wakeup(wb);
}
// 1042: static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long dirtied_before = jiffies;
struct inode *inode;
long progress;
struct blk_plug plug;
blk_start_plug(&plug);
spin_lock(&wb->list_lock);
for (;;) {
/*
 * Stop writeback when nr_pages has been consumed
 */
if (work->nr_pages <= 0)
break;
/*
 * Background writeout and kupdate-style writeback may
 * run forever. Stop them if there is other work to do
 * so that e.g. sync can proceed. They'll be restarted
 * after the other works are all done.
 */
if ((work->for_background || work->for_kupdate) &&
!list_empty(&wb->work_list))
break;
/*
 * For background writeout, stop when we are below the
 * background dirty threshold
 */
if (work->for_background && !wb_over_bg_thresh(wb))
break;
/*
 * Kupdate and background works are special and we want to
 * include all inodes that need writing. Livelock avoidance is
 * handled by these works yielding to any other work so we are
 * safe.
 */
if (work->for_kupdate) {
dirtied_before = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
dirtied_before = jiffies;
trace_writeback_start(wb, work);
if (list_empty(&wb->b_io))
queue_io(wb, work, dirtied_before);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb, work);
wb_update_bandwidth(wb, wb_start);
/*
 * Did we write something? Try for more
 *
 * Dirty inodes are moved to b_io for writeback in batches.
 * The completion of the current batch does not necessarily
 * mean the overall work is done. So we keep looping as long
 * as made some progress on cleaning pages or inodes.
 */
if (progress)
continue;
/*
 * No more inodes for IO, bail
 */
if (list_empty(&wb->b_more_io))
break;
/*
 * Nothing written. Wait for some inode to
 * become available for writeback. Otherwise
 * we'll just busyloop.
 */
trace_writeback_wait(wb, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
}
spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);
return nr_pages - work->nr_pages;
}
// 1771: static long __writeback_inodes_wb(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
unsigned long start_time = jiffies;
long wrote = 0;
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
if (!trylock_super(sb)) {
/*
 * trylock_super() may fail consistently due to
 * s_umount being grabbed by someone else. Don't use
 * requeue_io() to avoid busy retrying the inode/sb.
 */
redirty_tail(inode, wb);
continue;
}
wrote += writeback_sb_inodes(sb, wb, work);
up_read(&sb->s_umount);
/* refer to the same tests at the end of writeback_sb_inodes */
if (wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
/* Leave any unwritten inodes on b_io */
return wrote;
}
// 1629: static long writeback_sb_inodes(struct super_block *sb,
struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
struct writeback_control wbc = {
.sync_mode = work->sync_mode,
.tagged_writepages = work->tagged_writepages,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
.for_sync = work->for_sync,
.range_cyclic = work->range_cyclic,
.range_start = 0,
.range_end = LLONG_MAX,
};
unsigned long start_time = jiffies;
long write_chunk;
long wrote = 0; /* count both pages and inodes */
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct bdi_writeback *tmp_wb;
if (inode->i_sb != sb) {
if (work->sb) {
/*
 * We only want to write back data for this
 * superblock, move all inodes not belonging
 * to it back onto the dirty list.
 */
redirty_tail(inode, wb);
continue;
}
/*
 * The inode belongs to a different superblock.
 * Bounce back to the caller to unpin this and
 * pin the next superblock.
 */
break;
}
/*
 * Don't bother with new inodes or inodes being freed, first
 * kind does not need periodic writeout yet, and for the latter
 * kind writeout is handled by the freer.
 */
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
redirty_tail_locked(inode, wb);
spin_unlock(&inode->i_lock);
continue;
}
if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
/*
 * If this inode is locked for writeback and we are not
 * doing writeback-for-data-integrity, move it to
 * b_more_io so that writeback can proceed with the
 * other inodes on s_io.
 *
 * We'll have another go at writing back this inode
 * when we completed a full scan of b_io.
 */
spin_unlock(&inode->i_lock);
requeue_io(inode, wb);
trace_writeback_sb_inodes_requeue(inode);
continue;
}
spin_unlock(&wb->list_lock);
/*
 * We already requeued the inode if it had I_SYNC set and we
 * are doing WB_SYNC_NONE writeback. So this catches only the
 * WB_SYNC_ALL case.
 */
if (inode->i_state & I_SYNC) {
/* Wait for I_SYNC. This function drops i_lock... */
inode_sleep_on_writeback(inode);
/* Inode may be gone, start again */
spin_lock(&wb->list_lock);
continue;
}
inode->i_state |= I_SYNC;
wbc_attach_and_unlock_inode(&wbc, inode);
write_chunk = writeback_chunk_size(wb, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
/*
 * We use I_SYNC to pin the inode in memory. While it is set
 * evict_inode() will wait so the inode cannot be freed.
 */
__writeback_single_inode(inode, &wbc);
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
if (need_resched()) {
/*
 * We're trying to balance between building up a nice
 * long list of IOs to improve our merge rate, and
 * getting those IOs out quickly for anyone throttling
 * in balance_dirty_pages(). cond_resched() doesn't
 * unplug, so get our IOs out the door before we
 * give up the CPU.
 */
blk_flush_plug(current);
cond_resched();
}
/* * Requeue @inode if still dirty. Be careful as @inode may * have been switched to another wb in the meantime. */
tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
if (unlikely(tmp_wb != wb)) {
spin_unlock(&tmp_wb->list_lock);
spin_lock(&wb->list_lock);
}
/*
 * bail out to wb_writeback() often enough to check
 * background threshold and other termination conditions.
 */
if (wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
return wrote;
}
// 1449: static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
long nr_to_write = wbc->nr_to_write;
unsigned dirty;
int ret;
WARN_ON(!(inode->i_state & I_SYNC));
trace_writeback_single_inode_start(inode, wbc, nr_to_write);
ret = do_writepages(mapping, wbc);
/*
 * Make sure to wait on the data before writing out the metadata.
 * This is important for filesystems that modify metadata on data
 * I/O completion. We don't do it for sync(2) writeback because it has a
 * separate, external IO completion path and ->sync_fs for guaranteeing
 * inode metadata is written back correctly.
 */
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
}
/*
 * Some filesystems may redirty the inode during the writeback
 * due to delalloc, clear dirty metadata flags right before
 * write_inode()
 */
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
if ((inode->i_state & I_DIRTY_TIME) &&
((dirty & I_DIRTY_INODE) ||
wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
time_after(jiffies, inode->dirtied_time_when +
dirtytime_expire_interval * HZ))) {
dirty |= I_DIRTY_TIME;
trace_writeback_lazytime(inode);
}
inode->i_state &= ~dirty;
/*
 * Paired with smp_mb() in __mark_inode_dirty(). This allows
 * __mark_inode_dirty() to test i_state without grabbing i_lock -
 * either they see the I_DIRTY bits cleared or we see the dirtied
 * inode.
 *
 * I_DIRTY_PAGES is always cleared together above even if @mapping
 * still has dirty pages. The flag is reinstated after smp_mb() if
 * necessary. This guarantees that either __mark_inode_dirty()
 * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
 */
smp_mb();
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state |= I_DIRTY_PAGES;
spin_unlock(&inode->i_lock);
if (dirty & I_DIRTY_TIME)
mark_inode_dirty_sync(inode);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & ~I_DIRTY_PAGES) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
}
trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}