| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * Copyright (c) 2023-2025 Christoph Hellwig. |
| * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. |
| */ |
| #include "xfs.h" |
| #include "xfs_shared.h" |
| #include "xfs_format.h" |
| #include "xfs_log_format.h" |
| #include "xfs_trans_resv.h" |
| #include "xfs_mount.h" |
| #include "xfs_inode.h" |
| #include "xfs_btree.h" |
| #include "xfs_trans.h" |
| #include "xfs_icache.h" |
| #include "xfs_rmap.h" |
| #include "xfs_rtbitmap.h" |
| #include "xfs_rtrmap_btree.h" |
| #include "xfs_zone_alloc.h" |
| #include "xfs_zone_priv.h" |
| #include "xfs_zones.h" |
| #include "xfs_trace.h" |
| |
| /* |
| * Implement Garbage Collection (GC) of partially used zones. |
| * |
| * To support the purely sequential writes in each zone, zoned XFS needs to be |
| * able to move data remaining in a zone out of it to reset the zone to prepare |
| * for writing to it again. |
| * |
| * This is done by the GC thread implemented in this file. To support that, a |
| * number of zones (XFS_GC_ZONES) is reserved from the user-visible capacity to |
| * write the garbage collected data into. |
| * |
| * Whenever the available space is below the chosen threshold, the GC thread |
| * looks for non-empty but not fully used zones that are worth |
| * reclaiming. Once a victim is found, its rmap is queried, and after |
| * a bit of sorting to reduce fragmentation, the still live extents are read |
| * into memory and written to the GC target zone, and the bmap btree of the |
| * files is updated to point to the new location. To avoid taking the IOLOCK |
| * and MMAPLOCK for the entire GC process and thus affecting the latency of |
| * user reads and writes to the files, the GC writes are speculative and the |
| * I/O completion checks that no other writes happened for the affected regions |
| * before remapping. |
| * |
| * Once a zone does not contain any valid data, be that through GC or user |
| * block removal, it is queued for a zone reset. The reset operation |
| * carefully ensures that the RT device cache is flushed and all transactions |
| * referencing the rmap have been committed to disk. |
| */ |
| |
| /* |
| * Size of each GC scratch pad. This is also the upper bound for each |
| * GC I/O, which helps to keep latency down. |
| */ |
| #define XFS_GC_CHUNK_SIZE SZ_1M |
| |
| /* |
| * Scratchpad data to read GCed data into. |
| * |
| * The offset member tracks where the next allocation starts, and freed tracks |
| * the amount of space that is not used anymore. |
| */ |
| #define XFS_ZONE_GC_NR_SCRATCH 2 |
| struct xfs_zone_scratch { |
| struct folio *folio; |
| unsigned int offset; |
| unsigned int freed; |
| }; |
| |
| /* |
| * Chunk that is read and written for each GC operation. |
| * |
| * Note that for writes to actual zoned devices, the chunk can be split when |
| * reaching the hardware limit. |
| */ |
| struct xfs_gc_bio { |
| struct xfs_zone_gc_data *data; |
| |
| /* |
| * Entry into the reading/writing/resetting list. Only accessed from |
| * the GC thread, so no locking needed. |
| */ |
| struct list_head entry; |
| |
| /* |
| * State of this gc_bio. Done means the current I/O completed. |
| * Set from the bio end I/O handler, read from the GC thread. |
| */ |
| enum { |
| XFS_GC_BIO_NEW, |
| XFS_GC_BIO_DONE, |
| } state; |
| |
| /* |
| * Pointer to the inode and byte range in the inode that this |
| * GC chunk is operating on. |
| */ |
| struct xfs_inode *ip; |
| loff_t offset; |
| unsigned int len; |
| |
| /* |
| * Existing startblock (in the zone to be freed) and newly assigned |
| * daddr in the zone GCed into. |
| */ |
| xfs_fsblock_t old_startblock; |
| xfs_daddr_t new_daddr; |
| struct xfs_zone_scratch *scratch; |
| |
| /* Are we writing to a sequential write required zone? */ |
| bool is_seq; |
| |
| /* Open Zone being written to */ |
| struct xfs_open_zone *oz; |
| |
| /* Bio used for reads and writes, including the bvec used by it */ |
| struct bio_vec bv; |
| struct bio bio; /* must be last */ |
| }; |
| |
| #define XFS_ZONE_GC_RECS 1024 |
| |
| /* iterator, needs to be reinitialized for each victim zone */ |
| struct xfs_zone_gc_iter { |
| struct xfs_rtgroup *victim_rtg; |
| unsigned int rec_count; |
| unsigned int rec_idx; |
| xfs_agblock_t next_startblock; |
| struct xfs_rmap_irec *recs; |
| }; |
| |
| /* |
| * Per-mount GC state. |
| */ |
| struct xfs_zone_gc_data { |
| struct xfs_mount *mp; |
| |
| /* bioset used to allocate the gc_bios */ |
| struct bio_set bio_set; |
| |
| /* |
| * Scratchpads used, and an index to indicate which one is currently in use. |
| */ |
| struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH]; |
| unsigned int scratch_idx; |
| |
| /* |
| * Lists of bios currently being read, written, and reset. |
| * These lists are only accessed by the GC thread itself, and must only |
| * be processed in order. |
| */ |
| struct list_head reading; |
| struct list_head writing; |
| struct list_head resetting; |
| |
| /* |
| * Iterator for the victim zone. |
| */ |
| struct xfs_zone_gc_iter iter; |
| }; |
| |
| /* |
| * We aim to keep enough zones free in stock to fully use the open zone limit |
| * for data placement purposes. Additionally, the m_zonegc_low_space tunable |
| * can be set to make sure a fraction of the unused blocks are available for |
| * writing. |
| */ |
| bool |
| xfs_zoned_need_gc( |
| struct xfs_mount *mp) |
| { |
| s64 available, free, threshold; |
| s32 remainder; |
| |
| if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) |
| return false; |
| |
| available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); |
| |
| if (available < |
| mp->m_groups[XG_TYPE_RTG].blocks * |
| (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) |
| return true; |
| |
| free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); |
| |
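| /* |
| * Scale the free block count by the m_zonegc_low_space percentage, i.e. |
| * compute roughly free * m_zonegc_low_space / 100, split into quotient |
| * and remainder so that the intermediate products stay small. E.g. with |
| * 1,000,000 free blocks and m_zonegc_low_space set to 5 this yields a |
| * threshold of about 50,000 blocks. |
| */ |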
| threshold = div_s64_rem(free, 100, &remainder); |
| threshold = threshold * mp->m_zonegc_low_space + |
| remainder * div_s64(mp->m_zonegc_low_space, 100); |
| |
| if (available < threshold) |
| return true; |
| |
| return false; |
| } |
| |
| static struct xfs_zone_gc_data * |
| xfs_zone_gc_data_alloc( |
| struct xfs_mount *mp) |
| { |
| struct xfs_zone_gc_data *data; |
| int i; |
| |
| data = kzalloc(sizeof(*data), GFP_KERNEL); |
| if (!data) |
| return NULL; |
| data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs), |
| GFP_KERNEL); |
| if (!data->iter.recs) |
| goto out_free_data; |
| |
| /* |
| * We actually only need a single bio_vec. It would be nice to have |
| * a flag that only allocates the inline bvecs and not the separate |
| * bvec pool. |
| */ |
| if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), |
| BIOSET_NEED_BVECS)) |
| goto out_free_recs; |
| for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) { |
| data->scratch[i].folio = |
| folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE)); |
| if (!data->scratch[i].folio) |
| goto out_free_scratch; |
| } |
| INIT_LIST_HEAD(&data->reading); |
| INIT_LIST_HEAD(&data->writing); |
| INIT_LIST_HEAD(&data->resetting); |
| data->mp = mp; |
| return data; |
| |
| out_free_scratch: |
| while (--i >= 0) |
| folio_put(data->scratch[i].folio); |
| bioset_exit(&data->bio_set); |
| out_free_recs: |
| kfree(data->iter.recs); |
| out_free_data: |
| kfree(data); |
| return NULL; |
| } |
| |
| static void |
| xfs_zone_gc_data_free( |
| struct xfs_zone_gc_data *data) |
| { |
| int i; |
| |
| for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) |
| folio_put(data->scratch[i].folio); |
| bioset_exit(&data->bio_set); |
| kfree(data->iter.recs); |
| kfree(data); |
| } |
| |
| static void |
| xfs_zone_gc_iter_init( |
| struct xfs_zone_gc_iter *iter, |
| struct xfs_rtgroup *victim_rtg) |
| { |
| iter->next_startblock = 0; |
| iter->rec_count = 0; |
| iter->rec_idx = 0; |
| iter->victim_rtg = victim_rtg; |
| } |
| |
| /* |
| * Query the rmap of the victim zone to gather the records to evacuate. |
| */ |
| static int |
| xfs_zone_gc_query_cb( |
| struct xfs_btree_cur *cur, |
| const struct xfs_rmap_irec *irec, |
| void *private) |
| { |
| struct xfs_zone_gc_iter *iter = private; |
| |
| ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); |
| ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); |
| ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); |
| |
| iter->recs[iter->rec_count] = *irec; |
| if (++iter->rec_count == XFS_ZONE_GC_RECS) { |
| iter->next_startblock = |
| irec->rm_startblock + irec->rm_blockcount; |
| return 1; |
| } |
| return 0; |
| } |
| |
| static int |
| xfs_zone_gc_rmap_rec_cmp( |
| const void *a, |
| const void *b) |
| { |
| const struct xfs_rmap_irec *reca = a; |
| const struct xfs_rmap_irec *recb = b; |
| int diff; |
| |
| diff = cmp_int(reca->rm_owner, recb->rm_owner); |
| if (diff) |
| return diff; |
| return cmp_int(reca->rm_offset, recb->rm_offset); |
| } |
| |
| static int |
| xfs_zone_gc_query( |
| struct xfs_mount *mp, |
| struct xfs_zone_gc_iter *iter) |
| { |
| struct xfs_rtgroup *rtg = iter->victim_rtg; |
| struct xfs_rmap_irec ri_low = { }; |
| struct xfs_rmap_irec ri_high; |
| struct xfs_btree_cur *cur; |
| struct xfs_trans *tp; |
| int error; |
| |
| ASSERT(iter->next_startblock <= rtg_blocks(rtg)); |
| if (iter->next_startblock == rtg_blocks(rtg)) |
| goto done; |
| |
| ASSERT(iter->next_startblock < rtg_blocks(rtg)); |
| ri_low.rm_startblock = iter->next_startblock; |
| memset(&ri_high, 0xFF, sizeof(ri_high)); |
| |
| iter->rec_idx = 0; |
| iter->rec_count = 0; |
| |
| tp = xfs_trans_alloc_empty(mp); |
| xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); |
| cur = xfs_rtrmapbt_init_cursor(tp, rtg); |
| error = xfs_rmap_query_range(cur, &ri_low, &ri_high, |
| xfs_zone_gc_query_cb, iter); |
| xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); |
| xfs_btree_del_cursor(cur, error < 0 ? error : 0); |
| xfs_trans_cancel(tp); |
| |
| if (error < 0) |
| return error; |
| |
| /* |
| * Sort the rmap records by inode number and increasing offset to |
| * defragment the mappings. |
| * |
| * This could be further enhanced by an even bigger look-ahead window, |
| * but that's better left until we have better detection of changes to |
| * inode mappings to avoid the potential of GCing already dead data. |
| */ |
| sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), |
| xfs_zone_gc_rmap_rec_cmp, NULL); |
| |
| if (error == 0) { |
| /* |
| * We finished iterating through the zone. |
| */ |
| iter->next_startblock = rtg_blocks(rtg); |
| if (iter->rec_count == 0) |
| goto done; |
| } |
| |
| return 0; |
| done: |
| xfs_rtgroup_rele(iter->victim_rtg); |
| iter->victim_rtg = NULL; |
| return 0; |
| } |
| |
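| /* |
| * Look up the next rmap record to garbage collect and grab a reference to |
| * the inode owning it. Refills the record array from the rmap btree when |
| * the current batch has been consumed, and skips records whose inode has |
| * been deleted or is not a regular realtime file. Returns false when the |
| * victim zone is exhausted or the file system has shut down. |
| */ |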
| static bool |
| xfs_zone_gc_iter_next( |
| struct xfs_mount *mp, |
| struct xfs_zone_gc_iter *iter, |
| struct xfs_rmap_irec *chunk_rec, |
| struct xfs_inode **ipp) |
| { |
| struct xfs_rmap_irec *irec; |
| int error; |
| |
| if (!iter->victim_rtg) |
| return false; |
| |
| retry: |
| if (iter->rec_idx == iter->rec_count) { |
| error = xfs_zone_gc_query(mp, iter); |
| if (error) |
| goto fail; |
| if (!iter->victim_rtg) |
| return false; |
| } |
| |
| irec = &iter->recs[iter->rec_idx]; |
| error = xfs_iget(mp, NULL, irec->rm_owner, |
| XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); |
| if (error) { |
| /* |
| * If the inode was already deleted, skip over it. |
| */ |
| if (error == -ENOENT) { |
| iter->rec_idx++; |
| goto retry; |
| } |
| goto fail; |
| } |
| |
| if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { |
| iter->rec_idx++; |
| xfs_irele(*ipp); |
| goto retry; |
| } |
| |
| *chunk_rec = *irec; |
| return true; |
| |
| fail: |
| xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); |
| return false; |
| } |
| |
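| /* |
| * Advance the iterator past the blocks just queued for GC, moving on to the |
| * next record once the current one is fully consumed. |
| */ |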
| static void |
| xfs_zone_gc_iter_advance( |
| struct xfs_zone_gc_iter *iter, |
| xfs_extlen_t count_fsb) |
| { |
| struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; |
| |
| irec->rm_offset += count_fsb; |
| irec->rm_startblock += count_fsb; |
| irec->rm_blockcount -= count_fsb; |
| if (!irec->rm_blockcount) |
| iter->rec_idx++; |
| } |
| |
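| /* |
| * Pick the zone with the fewest used blocks from the given used-space |
| * bucket. Called with zi_used_buckets_lock held. Returns the victim with |
| * a group reference held, or NULL if the bucket is empty. |
| */ |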
| static struct xfs_rtgroup * |
| xfs_zone_gc_pick_victim_from( |
| struct xfs_mount *mp, |
| uint32_t bucket) |
| { |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| uint32_t victim_used = U32_MAX; |
| struct xfs_rtgroup *victim_rtg = NULL; |
| uint32_t bit; |
| |
| if (!zi->zi_used_bucket_entries[bucket]) |
| return NULL; |
| |
| for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], |
| mp->m_sb.sb_rgcount) { |
| struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); |
| |
| if (!rtg) |
| continue; |
| |
| /* skip zones that are just waiting for a reset */ |
| if (rtg_rmap(rtg)->i_used_blocks == 0 || |
| rtg_rmap(rtg)->i_used_blocks >= victim_used) { |
| xfs_rtgroup_rele(rtg); |
| continue; |
| } |
| |
| if (victim_rtg) |
| xfs_rtgroup_rele(victim_rtg); |
| victim_rtg = rtg; |
| victim_used = rtg_rmap(rtg)->i_used_blocks; |
| |
| /* |
| * Any zone that is less than 1 percent used is fair game for |
| * instant reclaim. All of these zones are in bucket 0, so |
| * avoid the expensive division for the zones in the other |
| * buckets. |
| */ |
| if (bucket == 0 && |
| rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) |
| break; |
| } |
| |
| return victim_rtg; |
| } |
| |
| /* |
| * Iterate through all zones marked as reclaimable and find a candidate to |
| * reclaim. |
| */ |
| static bool |
| xfs_zone_gc_select_victim( |
| struct xfs_zone_gc_data *data) |
| { |
| struct xfs_zone_gc_iter *iter = &data->iter; |
| struct xfs_mount *mp = data->mp; |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| struct xfs_rtgroup *victim_rtg = NULL; |
| unsigned int bucket; |
| |
| if (xfs_is_shutdown(mp)) |
| return false; |
| |
| if (iter->victim_rtg) |
| return true; |
| |
| /* |
| * Don't start new work if we are asked to stop or park. |
| */ |
| if (kthread_should_stop() || kthread_should_park()) |
| return false; |
| |
| if (!xfs_zoned_need_gc(mp)) |
| return false; |
| |
| spin_lock(&zi->zi_used_buckets_lock); |
| for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { |
| victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); |
| if (victim_rtg) |
| break; |
| } |
| spin_unlock(&zi->zi_used_buckets_lock); |
| |
| if (!victim_rtg) |
| return false; |
| |
| trace_xfs_zone_gc_select_victim(victim_rtg, bucket); |
| xfs_zone_gc_iter_init(iter, victim_rtg); |
| return true; |
| } |
| |
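| /* |
| * Take the open zone with the least allocated space off the list of regular |
| * open zones so that GC can write into it instead. Used at mount time when |
| * either no free zone is left or the open zone limit is already reached. |
| */ |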
| static struct xfs_open_zone * |
| xfs_zone_gc_steal_open( |
| struct xfs_zone_info *zi) |
| { |
| struct xfs_open_zone *oz, *found = NULL; |
| |
| spin_lock(&zi->zi_open_zones_lock); |
| list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { |
| if (!found || oz->oz_allocated < found->oz_allocated) |
| found = oz; |
| } |
| |
| if (found) { |
| found->oz_is_gc = true; |
| list_del_init(&found->oz_entry); |
| zi->zi_nr_open_zones--; |
| } |
| |
| spin_unlock(&zi->zi_open_zones_lock); |
| return found; |
| } |
| |
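| /* |
| * Open a new zone for GC to write into and install it as zi_open_gc_zone. |
| * Returns NULL while writes to the previous GC target are still in flight, |
| * or if no zone could be opened. |
| */ |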
| static struct xfs_open_zone * |
| xfs_zone_gc_select_target( |
| struct xfs_mount *mp) |
| { |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| struct xfs_open_zone *oz = zi->zi_open_gc_zone; |
| |
| /* |
| * We need to wait for pending writes to finish. |
| */ |
| if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) |
| return NULL; |
| |
| ASSERT(zi->zi_nr_open_zones <= |
| mp->m_max_open_zones - XFS_OPEN_GC_ZONES); |
| oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); |
| if (oz) |
| trace_xfs_zone_gc_target_opened(oz->oz_rtg); |
| spin_lock(&zi->zi_open_zones_lock); |
| zi->zi_open_gc_zone = oz; |
| spin_unlock(&zi->zi_open_zones_lock); |
| return oz; |
| } |
| |
| /* |
| * Ensure we have a valid open zone to write the GC data to. |
| * |
| * If the current target zone has space keep writing to it, else first wait for |
| * all pending writes and then pick a new one. |
| */ |
| static struct xfs_open_zone * |
| xfs_zone_gc_ensure_target( |
| struct xfs_mount *mp) |
| { |
| struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; |
| |
| if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg)) |
| return xfs_zone_gc_select_target(mp); |
| return oz; |
| } |
| |
| static unsigned int |
| xfs_zone_gc_scratch_available( |
| struct xfs_zone_gc_data *data) |
| { |
| return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset; |
| } |
| |
| static bool |
| xfs_zone_gc_space_available( |
| struct xfs_zone_gc_data *data) |
| { |
| struct xfs_open_zone *oz; |
| |
| oz = xfs_zone_gc_ensure_target(data->mp); |
| if (!oz) |
| return false; |
| return oz->oz_allocated < rtg_blocks(oz->oz_rtg) && |
| xfs_zone_gc_scratch_available(data); |
| } |
| |
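| /* |
| * Bio completion handler for GC reads, writes and zone resets: mark the |
| * chunk as done and wake the GC thread to process the completion. |
| */ |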
| static void |
| xfs_zone_gc_end_io( |
| struct bio *bio) |
| { |
| struct xfs_gc_bio *chunk = |
| container_of(bio, struct xfs_gc_bio, bio); |
| struct xfs_zone_gc_data *data = chunk->data; |
| |
| WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); |
| wake_up_process(data->mp->m_zone_info->zi_gc_thread); |
| } |
| |
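| /* |
| * Allocate space in the GC target zone for the next chunk, clamping the |
| * length to the available scratch space, the remaining capacity of the |
| * target zone and the reserved free space. Returns the target open zone |
| * with an extra reference held, or NULL if nothing can be allocated. |
| */ |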
| static struct xfs_open_zone * |
| xfs_zone_gc_alloc_blocks( |
| struct xfs_zone_gc_data *data, |
| xfs_extlen_t *count_fsb, |
| xfs_daddr_t *daddr, |
| bool *is_seq) |
| { |
| struct xfs_mount *mp = data->mp; |
| struct xfs_open_zone *oz; |
| |
| oz = xfs_zone_gc_ensure_target(mp); |
| if (!oz) |
| return NULL; |
| |
| *count_fsb = min(*count_fsb, |
| XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data))); |
| |
| /* |
| * Directly allocate GC blocks from the reserved pool. |
| * |
| * If we took them from the normal pool we could be stealing blocks |
| * from a regular writer, which would then have to wait for GC to make |
| * progress and could deadlock. |
| */ |
| spin_lock(&mp->m_sb_lock); |
| *count_fsb = min(*count_fsb, |
| rtg_blocks(oz->oz_rtg) - oz->oz_allocated); |
| *count_fsb = min3(*count_fsb, |
| mp->m_free[XC_FREE_RTEXTENTS].res_avail, |
| mp->m_free[XC_FREE_RTAVAILABLE].res_avail); |
| mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; |
| mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; |
| spin_unlock(&mp->m_sb_lock); |
| |
| if (!*count_fsb) |
| return NULL; |
| |
| *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); |
| *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); |
| if (!*is_seq) |
| *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated); |
| oz->oz_allocated += *count_fsb; |
| atomic_inc(&oz->oz_ref); |
| return oz; |
| } |
| |
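| /* |
| * Start garbage collecting the next chunk: look up the next live extent in |
| * the victim zone, allocate space for it in the GC target zone and submit |
| * a read into the scratch folio. Returns false if there is nothing (more) |
| * to do right now. |
| */ |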
| static bool |
| xfs_zone_gc_start_chunk( |
| struct xfs_zone_gc_data *data) |
| { |
| struct xfs_zone_gc_iter *iter = &data->iter; |
| struct xfs_mount *mp = data->mp; |
| struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; |
| struct xfs_open_zone *oz; |
| struct xfs_rmap_irec irec; |
| struct xfs_gc_bio *chunk; |
| struct xfs_inode *ip; |
| struct bio *bio; |
| xfs_daddr_t daddr; |
| bool is_seq; |
| |
| if (xfs_is_shutdown(mp)) |
| return false; |
| |
| if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) |
| return false; |
| oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, |
| &is_seq); |
| if (!oz) { |
| xfs_irele(ip); |
| return false; |
| } |
| |
| bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set); |
| |
| chunk = container_of(bio, struct xfs_gc_bio, bio); |
| chunk->ip = ip; |
| chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); |
| chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); |
| chunk->old_startblock = |
| xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); |
| chunk->new_daddr = daddr; |
| chunk->is_seq = is_seq; |
| chunk->scratch = &data->scratch[data->scratch_idx]; |
| chunk->data = data; |
| chunk->oz = oz; |
| |
| bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); |
| bio->bi_end_io = xfs_zone_gc_end_io; |
| bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len, |
| chunk->scratch->offset); |
| chunk->scratch->offset += chunk->len; |
| if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) { |
| data->scratch_idx = |
| (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH; |
| } |
| WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); |
| list_add_tail(&chunk->entry, &data->reading); |
| xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); |
| |
| submit_bio(bio); |
| return true; |
| } |
| |
| static void |
| xfs_zone_gc_free_chunk( |
| struct xfs_gc_bio *chunk) |
| { |
| list_del(&chunk->entry); |
| xfs_open_zone_put(chunk->oz); |
| xfs_irele(chunk->ip); |
| bio_put(&chunk->bio); |
| } |
| |
| static void |
| xfs_zone_gc_submit_write( |
| struct xfs_zone_gc_data *data, |
| struct xfs_gc_bio *chunk) |
| { |
| if (chunk->is_seq) { |
| chunk->bio.bi_opf &= ~REQ_OP_WRITE; |
| chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; |
| } |
| chunk->bio.bi_iter.bi_sector = chunk->new_daddr; |
| chunk->bio.bi_end_io = xfs_zone_gc_end_io; |
| submit_bio(&chunk->bio); |
| } |
| |
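| /* |
| * Split off the largest chunk that fits into the device zone append limit |
| * from a GC write. The split chunk is queued right in front of the |
| * remainder so that the completion order is preserved. Returns NULL if no |
| * split is needed. |
| */ |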
| static struct xfs_gc_bio * |
| xfs_zone_gc_split_write( |
| struct xfs_zone_gc_data *data, |
| struct xfs_gc_bio *chunk) |
| { |
| struct queue_limits *lim = |
| &bdev_get_queue(chunk->bio.bi_bdev)->limits; |
| struct xfs_gc_bio *split_chunk; |
| int split_sectors; |
| unsigned int split_len; |
| struct bio *split; |
| unsigned int nsegs; |
| |
| if (!chunk->is_seq) |
| return NULL; |
| |
| split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, |
| lim->max_zone_append_sectors << SECTOR_SHIFT); |
| if (!split_sectors) |
| return NULL; |
| |
| /* ensure the split chunk is still block size aligned */ |
| split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, |
| data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT; |
| split_len = split_sectors << SECTOR_SHIFT; |
| |
| split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); |
| split_chunk = container_of(split, struct xfs_gc_bio, bio); |
| split_chunk->data = data; |
| ihold(VFS_I(chunk->ip)); |
| split_chunk->ip = chunk->ip; |
| split_chunk->is_seq = chunk->is_seq; |
| split_chunk->scratch = chunk->scratch; |
| split_chunk->offset = chunk->offset; |
| split_chunk->len = split_len; |
| split_chunk->old_startblock = chunk->old_startblock; |
| split_chunk->new_daddr = chunk->new_daddr; |
| split_chunk->oz = chunk->oz; |
| atomic_inc(&chunk->oz->oz_ref); |
| |
| chunk->offset += split_len; |
| chunk->len -= split_len; |
| chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); |
| |
| /* add right before the original chunk */ |
| WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); |
| list_add_tail(&split_chunk->entry, &chunk->entry); |
| return split_chunk; |
| } |
| |
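| /* |
| * A GC read has completed: reuse the bio to write the data from the scratch |
| * folio to its new location in the GC target zone, splitting the write if |
| * it exceeds the zone append limit. |
| */ |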
| static void |
| xfs_zone_gc_write_chunk( |
| struct xfs_gc_bio *chunk) |
| { |
| struct xfs_zone_gc_data *data = chunk->data; |
| struct xfs_mount *mp = chunk->ip->i_mount; |
| phys_addr_t bvec_paddr = |
| bvec_phys(bio_first_bvec_all(&chunk->bio)); |
| struct xfs_gc_bio *split_chunk; |
| |
| if (chunk->bio.bi_status) |
| xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); |
| if (xfs_is_shutdown(mp)) { |
| xfs_zone_gc_free_chunk(chunk); |
| return; |
| } |
| |
| WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); |
| list_move_tail(&chunk->entry, &data->writing); |
| |
| bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE); |
| bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len, |
| offset_in_folio(chunk->scratch->folio, bvec_paddr)); |
| |
| while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) |
| xfs_zone_gc_submit_write(data, split_chunk); |
| xfs_zone_gc_submit_write(data, chunk); |
| } |
| |
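| /* |
| * A GC write has completed: return the scratch space, wait out racing |
| * direct I/O and layouts, and remap the file to the new blocks. The |
| * remapping in xfs_zoned_end_io() is skipped if the data was overwritten |
| * while GC was running, which is detected by comparing the old startblock. |
| */ |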
| static void |
| xfs_zone_gc_finish_chunk( |
| struct xfs_gc_bio *chunk) |
| { |
| uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; |
| struct xfs_inode *ip = chunk->ip; |
| struct xfs_mount *mp = ip->i_mount; |
| int error; |
| |
| if (chunk->bio.bi_status) |
| xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); |
| if (xfs_is_shutdown(mp)) { |
| xfs_zone_gc_free_chunk(chunk); |
| return; |
| } |
| |
| chunk->scratch->freed += chunk->len; |
| if (chunk->scratch->freed == chunk->scratch->offset) { |
| chunk->scratch->offset = 0; |
| chunk->scratch->freed = 0; |
| } |
| |
| /* |
| * Cycle through the iolock and wait for direct I/O and layouts to |
| * ensure no one is reading from the old mapping before it goes away. |
| * |
| * Note that xfs_zoned_end_io() below checks that no other writer raced |
| * with us to update the mapping by checking that the old startblock |
| * didn't change. |
| */ |
| xfs_ilock(ip, iolock); |
| error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); |
| if (!error) |
| inode_dio_wait(VFS_I(ip)); |
| xfs_iunlock(ip, iolock); |
| if (error) |
| goto free; |
| |
| if (chunk->is_seq) |
| chunk->new_daddr = chunk->bio.bi_iter.bi_sector; |
| error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, |
| chunk->new_daddr, chunk->oz, chunk->old_startblock); |
| free: |
| if (error) |
| xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); |
| xfs_zone_gc_free_chunk(chunk); |
| } |
| |
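| /* |
| * A zone reset (or discard) has completed: mark the zone as free again and |
| * wake up anyone waiting for free space. |
| */ |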
| static void |
| xfs_zone_gc_finish_reset( |
| struct xfs_gc_bio *chunk) |
| { |
| struct xfs_rtgroup *rtg = chunk->bio.bi_private; |
| struct xfs_mount *mp = rtg_mount(rtg); |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| |
| if (chunk->bio.bi_status) { |
| xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); |
| goto out; |
| } |
| |
| xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); |
| atomic_inc(&zi->zi_nr_free_zones); |
| |
| xfs_zoned_add_available(mp, rtg_blocks(rtg)); |
| |
| wake_up_all(&zi->zi_zone_wait); |
| out: |
| list_del(&chunk->entry); |
| bio_put(&chunk->bio); |
| } |
| |
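| /* |
| * Prepare a bio to reset a zone. For conventional zones a discard is |
| * issued instead; if the device doesn't support discard either, return |
| * false and the caller skips the I/O entirely. |
| */ |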
| static bool |
| xfs_zone_gc_prepare_reset( |
| struct bio *bio, |
| struct xfs_rtgroup *rtg) |
| { |
| trace_xfs_zone_reset(rtg); |
| |
| ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); |
| bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); |
| if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { |
| if (!bdev_max_discard_sectors(bio->bi_bdev)) |
| return false; |
| bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC; |
| bio->bi_iter.bi_size = |
| XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg)); |
| } |
| |
| return true; |
| } |
| |
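| /* |
| * Synchronously reset a single zone, for use outside of the GC thread's |
| * normal reset state machine. |
| */ |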
| int |
| xfs_zone_gc_reset_sync( |
| struct xfs_rtgroup *rtg) |
| { |
| int error = 0; |
| struct bio bio; |
| |
| bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, |
| REQ_OP_ZONE_RESET); |
| if (xfs_zone_gc_prepare_reset(&bio, rtg)) |
| error = submit_bio_wait(&bio); |
| bio_uninit(&bio); |
| |
| return error; |
| } |
| |
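| /* |
| * Submit resets for all zones on the passed in reset list. The RT device |
| * cache is flushed first, and the rmap inode of each zone is forced to the |
| * log so that all transactions referencing the old blocks are on disk |
| * before the zone is reset. |
| */ |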
| static void |
| xfs_zone_gc_reset_zones( |
| struct xfs_zone_gc_data *data, |
| struct xfs_group *reset_list) |
| { |
| struct xfs_group *next = reset_list; |
| |
| if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { |
| xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); |
| return; |
| } |
| |
| do { |
| struct xfs_rtgroup *rtg = to_rtg(next); |
| struct xfs_gc_bio *chunk; |
| struct bio *bio; |
| |
| xfs_log_force_inode(rtg_rmap(rtg)); |
| |
| next = rtg_group(rtg)->xg_next_reset; |
| rtg_group(rtg)->xg_next_reset = NULL; |
| |
| bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, |
| 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); |
| bio->bi_private = rtg; |
| bio->bi_end_io = xfs_zone_gc_end_io; |
| |
| chunk = container_of(bio, struct xfs_gc_bio, bio); |
| chunk->data = data; |
| WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); |
| list_add_tail(&chunk->entry, &data->resetting); |
| |
| /* |
| * Also use the bio to drive the state machine when neither |
| * zone reset nor discard is supported to keep things simple. |
| */ |
| if (xfs_zone_gc_prepare_reset(bio, rtg)) |
| submit_bio(bio); |
| else |
| bio_endio(bio); |
| } while (next); |
| } |
| |
| /* |
| * Handle the work to read and write data for GC and to reset the zones, |
| * including handling all completions. |
| * |
| * Note that the order of the chunks is preserved so that we don't undo the |
| * optimal order established by xfs_zone_gc_query(). |
| */ |
| static bool |
| xfs_zone_gc_handle_work( |
| struct xfs_zone_gc_data *data) |
| { |
| struct xfs_zone_info *zi = data->mp->m_zone_info; |
| struct xfs_gc_bio *chunk, *next; |
| struct xfs_group *reset_list; |
| struct blk_plug plug; |
| |
| spin_lock(&zi->zi_reset_list_lock); |
| reset_list = zi->zi_reset_list; |
| zi->zi_reset_list = NULL; |
| spin_unlock(&zi->zi_reset_list_lock); |
| |
| if (!xfs_zone_gc_select_victim(data) || |
| !xfs_zone_gc_space_available(data)) { |
| if (list_empty(&data->reading) && |
| list_empty(&data->writing) && |
| list_empty(&data->resetting) && |
| !reset_list) |
| return false; |
| } |
| |
| __set_current_state(TASK_RUNNING); |
| try_to_freeze(); |
| |
| if (reset_list) |
| xfs_zone_gc_reset_zones(data, reset_list); |
| |
| list_for_each_entry_safe(chunk, next, &data->resetting, entry) { |
| if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) |
| break; |
| xfs_zone_gc_finish_reset(chunk); |
| } |
| |
| list_for_each_entry_safe(chunk, next, &data->writing, entry) { |
| if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) |
| break; |
| xfs_zone_gc_finish_chunk(chunk); |
| } |
| |
| blk_start_plug(&plug); |
| list_for_each_entry_safe(chunk, next, &data->reading, entry) { |
| if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) |
| break; |
| xfs_zone_gc_write_chunk(chunk); |
| } |
| blk_finish_plug(&plug); |
| |
| blk_start_plug(&plug); |
| while (xfs_zone_gc_start_chunk(data)) |
| ; |
| blk_finish_plug(&plug); |
| return true; |
| } |
| |
| /* |
| * Note that the current GC algorithm would break reflinks and thus duplicate |
| * data that was shared by multiple owners before. Because of that, reflinks |
| * are currently not supported on zoned file systems: a file system with both |
| * features enabled can neither be created nor mounted. |
| */ |
| static int |
| xfs_zoned_gcd( |
| void *private) |
| { |
| struct xfs_zone_gc_data *data = private; |
| struct xfs_mount *mp = data->mp; |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| unsigned int nofs_flag; |
| |
| nofs_flag = memalloc_nofs_save(); |
| set_freezable(); |
| |
| for (;;) { |
| set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); |
| xfs_set_zonegc_running(mp); |
| if (xfs_zone_gc_handle_work(data)) |
| continue; |
| |
| if (list_empty(&data->reading) && |
| list_empty(&data->writing) && |
| list_empty(&data->resetting) && |
| !zi->zi_reset_list) { |
| xfs_clear_zonegc_running(mp); |
| xfs_zoned_resv_wake_all(mp); |
| |
| if (kthread_should_stop()) { |
| __set_current_state(TASK_RUNNING); |
| break; |
| } |
| |
| if (kthread_should_park()) { |
| __set_current_state(TASK_RUNNING); |
| kthread_parkme(); |
| continue; |
| } |
| } |
| |
| schedule(); |
| } |
| xfs_clear_zonegc_running(mp); |
| |
| if (data->iter.victim_rtg) |
| xfs_rtgroup_rele(data->iter.victim_rtg); |
| |
| memalloc_nofs_restore(nofs_flag); |
| xfs_zone_gc_data_free(data); |
| return 0; |
| } |
| |
| void |
| xfs_zone_gc_start( |
| struct xfs_mount *mp) |
| { |
| if (xfs_has_zoned(mp)) |
| kthread_unpark(mp->m_zone_info->zi_gc_thread); |
| } |
| |
| void |
| xfs_zone_gc_stop( |
| struct xfs_mount *mp) |
| { |
| if (xfs_has_zoned(mp)) |
| kthread_park(mp->m_zone_info->zi_gc_thread); |
| } |
| |
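| /* |
| * Set up the GC target zone, the scratch buffers and the GC thread at mount |
| * time. The thread is created parked and only unparked by |
| * xfs_zone_gc_start() for writable mounts. |
| */ |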
| int |
| xfs_zone_gc_mount( |
| struct xfs_mount *mp) |
| { |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| struct xfs_zone_gc_data *data; |
| struct xfs_open_zone *oz; |
| int error; |
| |
| /* |
| * If there are no free zones available for GC, pick the open zone with |
| * the least used space to GC into. This should only happen after an |
| * unclean shutdown near ENOSPC while GC was ongoing. |
| * |
| * We also need to do this for the first gc zone allocation if we |
| * unmounted while at the open limit. |
| */ |
| if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || |
| zi->zi_nr_open_zones == mp->m_max_open_zones) |
| oz = xfs_zone_gc_steal_open(zi); |
| else |
| oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); |
| if (!oz) { |
| xfs_warn(mp, "unable to allocate a zone for gc"); |
| error = -EIO; |
| goto out; |
| } |
| |
| trace_xfs_zone_gc_target_opened(oz->oz_rtg); |
| zi->zi_open_gc_zone = oz; |
| |
| data = xfs_zone_gc_data_alloc(mp); |
| if (!data) { |
| error = -ENOMEM; |
| goto out_put_gc_zone; |
| } |
| |
| mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, |
| "xfs-zone-gc/%s", mp->m_super->s_id); |
| if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { |
| xfs_warn(mp, "unable to create zone gc thread"); |
| error = PTR_ERR(mp->m_zone_info->zi_gc_thread); |
| goto out_free_gc_data; |
| } |
| |
| /* xfs_zone_gc_start will unpark for rw mounts */ |
| kthread_park(mp->m_zone_info->zi_gc_thread); |
| return 0; |
| |
| out_free_gc_data: |
| kfree(data); |
| out_put_gc_zone: |
| xfs_open_zone_put(zi->zi_open_gc_zone); |
| out: |
| return error; |
| } |
| |
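| /* |
| * Stop the GC thread at unmount time. The thread frees its xfs_zone_gc_data |
| * itself before exiting, so only the open GC zone reference needs to be |
| * dropped here. |
| */ |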
| void |
| xfs_zone_gc_unmount( |
| struct xfs_mount *mp) |
| { |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| |
| kthread_stop(zi->zi_gc_thread); |
| if (zi->zi_open_gc_zone) |
| xfs_open_zone_put(zi->zi_open_gc_zone); |
| } |