| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * Copyright (c) 2023-2025 Christoph Hellwig. |
| * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. |
| */ |
| #include "xfs.h" |
| #include "xfs_shared.h" |
| #include "xfs_format.h" |
| #include "xfs_trans_resv.h" |
| #include "xfs_mount.h" |
| #include "xfs_inode.h" |
| #include "xfs_rtbitmap.h" |
| #include "xfs_zone_alloc.h" |
| #include "xfs_zone_priv.h" |
| #include "xfs_zones.h" |
| |
| /* |
| * Note: the zoned allocator does not support a rtextsize > 1, so this code |
| * and the allocator itself use file system blocks interchangeably with |
| * realtime extents, without doing the otherwise required conversions. |
| */ |
| |
| /* |
| * Per-task space reservation. |
| * |
| * Tasks that need to wait for GC to free up space allocate one of these |
| * on-stack and add it to the per-mount zi_reclaim_reservations list. |
| * The GC thread then wakes the tasks in order as space becomes available. |
| */ |
| struct xfs_zone_reservation { |
| struct list_head entry; |
| struct task_struct *task; |
| xfs_filblks_t count_fsb; |
| }; |
| |
| /* |
| * Calculate the number of reserved blocks. |
| * |
| * XC_FREE_RTEXTENTS counts the user-available capacity, i.e. how far the |
| * file system can be filled, while XC_FREE_RTAVAILABLE counts the blocks |
| * instantly available for writes without waiting for GC. |
| * |
| * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and |
| * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS |
| * is further reduced by at least one more zone as well as the optional |
| * persistently reserved blocks.  This allows the allocator to run more |
| * smoothly by not always triggering GC. |
| */ |
| uint64_t |
| xfs_zoned_default_resblks( |
| struct xfs_mount *mp, |
| enum xfs_free_counter ctr) |
| { |
| switch (ctr) { |
| case XC_FREE_RTEXTENTS: |
| return (uint64_t)XFS_RESERVED_ZONES * |
| mp->m_groups[XG_TYPE_RTG].blocks + |
| mp->m_sb.sb_rtreserved; |
| case XC_FREE_RTAVAILABLE: |
| return (uint64_t)XFS_GC_ZONES * |
| mp->m_groups[XG_TYPE_RTG].blocks; |
| default: |
| ASSERT(0); |
| return 0; |
| } |
| } |
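| |
| /* |
| * Purely illustrative example (the numbers are hypothetical and not taken |
| * from this file): on a mount with 65536-block zones and sb_rtreserved set |
| * to zero, the defaults above work out to |
| * |
| *   XC_FREE_RTEXTENTS:   XFS_RESERVED_ZONES * 65536 blocks |
| *   XC_FREE_RTAVAILABLE: XFS_GC_ZONES * 65536 blocks |
| * |
| * Because XFS_RESERVED_ZONES covers at least one zone more than |
| * XFS_GC_ZONES (see the comment above), the XC_FREE_RTEXTENTS reservation |
| * is always the larger of the two. |
| */ |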
| |
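| /* |
| * Wake all tasks waiting for reservation space so that they re-evaluate |
| * their wait condition in xfs_zoned_reserve_available(), e.g. to notice a |
| * file system shutdown and bail out with -EIO. |
| */ |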
| void |
| xfs_zoned_resv_wake_all( |
| struct xfs_mount *mp) |
| { |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| struct xfs_zone_reservation *reservation; |
| |
| spin_lock(&zi->zi_reservation_lock); |
| list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) |
| wake_up_process(reservation->task); |
| spin_unlock(&zi->zi_reservation_lock); |
| } |
| |
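| /* |
| * Return freed blocks to the XC_FREE_RTAVAILABLE counter and wake waiting |
| * tasks in FIFO order, as long as their reservations fit into the space |
| * that is now available. |
| */ |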
| void |
| xfs_zoned_add_available( |
| struct xfs_mount *mp, |
| xfs_filblks_t count_fsb) |
| { |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| struct xfs_zone_reservation *reservation; |
| |
| if (list_empty_careful(&zi->zi_reclaim_reservations)) { |
| xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); |
| return; |
| } |
| |
| spin_lock(&zi->zi_reservation_lock); |
| xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); |
| count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE); |
| list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) { |
| if (reservation->count_fsb > count_fsb) |
| break; |
| wake_up_process(reservation->task); |
| count_fsb -= reservation->count_fsb; |
| } |
| spin_unlock(&zi->zi_reservation_lock); |
| } |
| |
| static int |
| xfs_zoned_space_wait_error( |
| struct xfs_mount *mp) |
| { |
| if (xfs_is_shutdown(mp)) |
| return -EIO; |
| if (fatal_signal_pending(current)) |
| return -EINTR; |
| return 0; |
| } |
| |
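| /* |
| * Reserve blocks from the instantly available pool (XC_FREE_RTAVAILABLE). |
| * If not enough blocks are available, kick GC and, unless XFS_ZR_NOWAIT is |
| * given, wait on the zi_reclaim_reservations list until the reservation can |
| * be satisfied, the file system is shut down, or a fatal signal is pending. |
| */ |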
| static int |
| xfs_zoned_reserve_available( |
| struct xfs_mount *mp, |
| xfs_filblks_t count_fsb, |
| unsigned int flags) |
| { |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| struct xfs_zone_reservation reservation = { |
| .task = current, |
| .count_fsb = count_fsb, |
| }; |
| int error; |
| |
| /* |
| * If there are no waiters, try to directly grab the available blocks |
| * from the percpu counter. |
| * |
| * If the caller wants to dip into the reserved pool, also bypass the |
| * wait list. This relies on the fact that the reserved pool is |
| * generously sized and always has enough space. If a reserved |
| * allocation fails, we're in trouble. |
| */ |
| if (likely(list_empty_careful(&zi->zi_reclaim_reservations) || |
| (flags & XFS_ZR_RESERVED))) { |
| error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, |
| flags & XFS_ZR_RESERVED); |
| if (error != -ENOSPC) |
| return error; |
| } |
| |
| if (flags & XFS_ZR_NOWAIT) |
| return -EAGAIN; |
| |
| spin_lock(&zi->zi_reservation_lock); |
| list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations); |
| while ((error = xfs_zoned_space_wait_error(mp)) == 0) { |
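| /* |
| * Mark the task as about to sleep before re-checking the counter |
| * and dropping zi_reservation_lock, so that a wake_up_process() |
| * call from xfs_zoned_add_available() cannot be lost before we |
| * reach schedule(). |
| */ |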
| set_current_state(TASK_KILLABLE); |
| |
| error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, |
| flags & XFS_ZR_RESERVED); |
| if (error != -ENOSPC) |
| break; |
| |
| /* |
| * Make sure to start GC if it is not running already. As we |
| * check the rtavailable count when filling up zones, GC is |
| * normally already running at this point, but in some setups |
| * with very few zones we may completely run out of non- |
| * reserved blocks in between filling zones. |
| */ |
| if (!xfs_is_zonegc_running(mp)) |
| wake_up_process(zi->zi_gc_thread); |
| |
| /* |
| * If there is no reclaimable group left and we aren't still |
| * processing a pending GC request, give up, as we're fully out |
| * of space. |
| */ |
| if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) && |
| !xfs_is_zonegc_running(mp)) |
| break; |
| |
| spin_unlock(&zi->zi_reservation_lock); |
| schedule(); |
| spin_lock(&zi->zi_reservation_lock); |
| } |
| list_del(&reservation.entry); |
| spin_unlock(&zi->zi_reservation_lock); |
| |
| __set_current_state(TASK_RUNNING); |
| return error; |
| } |
| |
| /* |
| * Implement greedy space allocation for short writes by trying to grab all |
| * that is left after locking out other threads from trying to do the same. |
| * |
| * This isn't exactly optimal and can hopefully be replaced by a proper |
| * percpu_counter primitive one day. |
| */ |
| static int |
| xfs_zoned_reserve_extents_greedy( |
| struct xfs_mount *mp, |
| xfs_filblks_t *count_fsb, |
| unsigned int flags) |
| { |
| struct xfs_zone_info *zi = mp->m_zone_info; |
| s64 len = *count_fsb; |
| int error = -ENOSPC; |
| |
| spin_lock(&zi->zi_reservation_lock); |
| len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); |
| if (len > 0) { |
| *count_fsb = len; |
| error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb, |
| flags & XFS_ZR_RESERVED); |
| } |
| spin_unlock(&zi->zi_reservation_lock); |
| return error; |
| } |
| |
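| /* |
| * Reserve count_fsb blocks of both user capacity (XC_FREE_RTEXTENTS) and |
| * instantly writable space (XC_FREE_RTAVAILABLE).  On success the |
| * reservation is recorded in @ac, and any unused part of it must eventually |
| * be released through xfs_zoned_space_unreserve(). |
| */ |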
| int |
| xfs_zoned_space_reserve( |
| struct xfs_mount *mp, |
| xfs_filblks_t count_fsb, |
| unsigned int flags, |
| struct xfs_zone_alloc_ctx *ac) |
| { |
| int error; |
| |
| ASSERT(ac->reserved_blocks == 0); |
| ASSERT(ac->open_zone == NULL); |
| |
| error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, |
| flags & XFS_ZR_RESERVED); |
| if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1) |
| error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags); |
| if (error) |
| return error; |
| |
| error = xfs_zoned_reserve_available(mp, count_fsb, flags); |
| if (error) { |
| xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb); |
| return error; |
| } |
| ac->reserved_blocks = count_fsb; |
| return 0; |
| } |
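| |
| /* |
| * Illustrative sketch of a caller (hypothetical, not part of this file): |
| * a zoned write path would typically bracket its block allocation with the |
| * reserve and unreserve helpers, along the lines of |
| * |
| *   struct xfs_zone_alloc_ctx   ac = { }; |
| * |
| *   error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_GREEDY, &ac); |
| *   if (error) |
| *           return error; |
| *   (allocate and write up to ac.reserved_blocks blocks here) |
| *   xfs_zoned_space_unreserve(mp, &ac); |
| * |
| * The XFS_ZR_GREEDY flag is just one possible choice; whatever is still |
| * left in ac.reserved_blocks at unreserve time is handed back to both |
| * free space counters. |
| */ |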
| |
| void |
| xfs_zoned_space_unreserve( |
| struct xfs_mount *mp, |
| struct xfs_zone_alloc_ctx *ac) |
| { |
| if (ac->reserved_blocks > 0) { |
| xfs_zoned_add_available(mp, ac->reserved_blocks); |
| xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks); |
| } |
| if (ac->open_zone) |
| xfs_open_zone_put(ac->open_zone); |
| } |