drivers/block/zloop.c - linux - Git at Google

 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2025, Christoph Hellwig.
  * Copyright (c) 2025, Western Digital Corporation or its affiliates.
  *
  * Zoned Loop Device driver - exports a zoned block device using one file per
  * zone as backing storage.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

 #include <linux/module.h>
 #include <linux/blk-mq.h>
 #include <linux/blkzoned.h>
 #include <linux/pagemap.h>
 #include <linux/miscdevice.h>
 #include <linux/falloc.h>
 #include <linux/mutex.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>

 /*
  * Options for adding (and removing) a device.
  */
 enum {
 	ZLOOP_OPT_ERR			= 0,
 	ZLOOP_OPT_ID			= (1 << 0),
 	ZLOOP_OPT_CAPACITY		= (1 << 1),
 	ZLOOP_OPT_ZONE_SIZE		= (1 << 2),
 	ZLOOP_OPT_ZONE_CAPACITY		= (1 << 3),
 	ZLOOP_OPT_NR_CONV_ZONES		= (1 << 4),
 	ZLOOP_OPT_BASE_DIR		= (1 << 5),
 	ZLOOP_OPT_NR_QUEUES		= (1 << 6),
 	ZLOOP_OPT_QUEUE_DEPTH		= (1 << 7),
 	ZLOOP_OPT_BUFFERED_IO		= (1 << 8),
 };

 static const match_table_t zloop_opt_tokens = {
 	{ ZLOOP_OPT_ID,			"id=%d"	},
 	{ ZLOOP_OPT_CAPACITY,		"capacity_mb=%u"	},
 	{ ZLOOP_OPT_ZONE_SIZE,		"zone_size_mb=%u"	},
 	{ ZLOOP_OPT_ZONE_CAPACITY,	"zone_capacity_mb=%u"	},
 	{ ZLOOP_OPT_NR_CONV_ZONES,	"conv_zones=%u"		},
 	{ ZLOOP_OPT_BASE_DIR,		"base_dir=%s"		},
 	{ ZLOOP_OPT_NR_QUEUES,		"nr_queues=%u"		},
 	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u"	},
 	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io"		},
 	{ ZLOOP_OPT_ERR,		NULL			}
 };

 /* Default values for the "add" operation. */
 #define ZLOOP_DEF_ID			-1
 #define ZLOOP_DEF_ZONE_SIZE		((256ULL * SZ_1M) >> SECTOR_SHIFT)
 #define ZLOOP_DEF_NR_ZONES		64
 #define ZLOOP_DEF_NR_CONV_ZONES		8
 #define ZLOOP_DEF_BASE_DIR		"/var/local/zloop"
 #define ZLOOP_DEF_NR_QUEUES		1
 #define ZLOOP_DEF_QUEUE_DEPTH		128
 #define ZLOOP_DEF_BUFFERED_IO		false

 /* Arbitrary limit on the zone size (16GB). */
 #define ZLOOP_MAX_ZONE_SIZE_MB		16384

 struct zloop_options {
 	unsigned int		mask;
 	int			id;
 	sector_t		capacity;
 	sector_t		zone_size;
 	sector_t		zone_capacity;
 	unsigned int		nr_conv_zones;
 	char			*base_dir;
 	unsigned int		nr_queues;
 	unsigned int		queue_depth;
 	bool			buffered_io;
 };

 /*
  * Device states.
  */
 enum {
 	Zlo_creating = 0,
 	Zlo_live,
 	Zlo_deleting,
 };

 enum zloop_zone_flags {
 	ZLOOP_ZONE_CONV = 0,
 	ZLOOP_ZONE_SEQ_ERROR,
 };

 struct zloop_zone {
 	struct file		*file;

 	unsigned long		flags;
 	struct mutex		lock;
 	enum blk_zone_cond	cond;
 	sector_t		start;
 	sector_t		wp;

 	gfp_t			old_gfp_mask;
 };

 struct zloop_device {
 	unsigned int		id;
 	unsigned int		state;

 	struct blk_mq_tag_set	tag_set;
 	struct gendisk		*disk;

 	struct workqueue_struct *workqueue;
 	bool			buffered_io;

 	const char		*base_dir;
 	struct file		*data_dir;

 	unsigned int		zone_shift;
 	sector_t		zone_size;
 	sector_t		zone_capacity;
 	unsigned int		nr_zones;
 	unsigned int		nr_conv_zones;
 	unsigned int		block_size;

 	struct zloop_zone	zones[] __counted_by(nr_zones);
 };

 struct zloop_cmd {
 	struct work_struct	work;
 	atomic_t		ref;
 	sector_t		sector;
 	sector_t		nr_sectors;
 	long			ret;
 	struct kiocb		iocb;
 	struct bio_vec		*bvec;
 };

 static DEFINE_IDR(zloop_index_idr);
 static DEFINE_MUTEX(zloop_ctl_mutex);

 static unsigned int rq_zone_no(struct request *rq)
 {
 	struct zloop_device *zlo = rq->q->queuedata;

 	return blk_rq_pos(rq) >> zlo->zone_shift;
 }

 static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
 {
 	struct zloop_zone *zone = &zlo->zones[zone_no];
 	struct kstat stat;
 	sector_t file_sectors;
 	int ret;

 	lockdep_assert_held(&zone->lock);

 	ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
 	if (ret < 0) {
 		pr_err("Failed to get zone %u file stat (err=%d)\n",
 		       zone_no, ret);
 		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
 		return ret;
 	}

 	file_sectors = stat.size >> SECTOR_SHIFT;
 	if (file_sectors > zlo->zone_capacity) {
 		pr_err("Zone %u file too large (%llu sectors > %llu)\n",
 		       zone_no, file_sectors, zlo->zone_capacity);
 		return -EINVAL;
 	}

 	if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
 		pr_err("Zone %u file size not aligned to block size %u\n",
 		       zone_no, zlo->block_size);
 		return -EINVAL;
 	}

 	if (!file_sectors) {
 		zone->cond = BLK_ZONE_COND_EMPTY;
 		zone->wp = zone->start;
 	} else if (file_sectors == zlo->zone_capacity) {
 		zone->cond = BLK_ZONE_COND_FULL;
 		zone->wp = zone->start + zlo->zone_size;
 	} else {
 		zone->cond = BLK_ZONE_COND_CLOSED;
 		zone->wp = zone->start + file_sectors;
 	}

 	return 0;
 }

 static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
 {
 	struct zloop_zone *zone = &zlo->zones[zone_no];
 	int ret = 0;

 	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
 		return -EIO;

 	mutex_lock(&zone->lock);

 	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
 		ret = zloop_update_seq_zone(zlo, zone_no);
 		if (ret)
 			goto unlock;
 	}

 	switch (zone->cond) {
 	case BLK_ZONE_COND_EXP_OPEN:
 		break;
 	case BLK_ZONE_COND_EMPTY:
 	case BLK_ZONE_COND_CLOSED:
 	case BLK_ZONE_COND_IMP_OPEN:
 		zone->cond = BLK_ZONE_COND_EXP_OPEN;
 		break;
 	case BLK_ZONE_COND_FULL:
 	default:
 		ret = -EIO;
 		break;
 	}

 unlock:
 	mutex_unlock(&zone->lock);

 	return ret;
 }

 static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
 {
 	struct zloop_zone *zone = &zlo->zones[zone_no];
 	int ret = 0;

 	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
 		return -EIO;

 	mutex_lock(&zone->lock);

 	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
 		ret = zloop_update_seq_zone(zlo, zone_no);
 		if (ret)
 			goto unlock;
 	}

 	switch (zone->cond) {
 	case BLK_ZONE_COND_CLOSED:
 		break;
 	case BLK_ZONE_COND_IMP_OPEN:
 	case BLK_ZONE_COND_EXP_OPEN:
 		if (zone->wp == zone->start)
 			zone->cond = BLK_ZONE_COND_EMPTY;
 		else
 			zone->cond = BLK_ZONE_COND_CLOSED;
 		break;
 	case BLK_ZONE_COND_EMPTY:
 	case BLK_ZONE_COND_FULL:
 	default:
 		ret = -EIO;
 		break;
 	}

 unlock:
 	mutex_unlock(&zone->lock);

 	return ret;
 }

 static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
 {
 	struct zloop_zone *zone = &zlo->zones[zone_no];
 	int ret = 0;

 	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
 		return -EIO;

 	mutex_lock(&zone->lock);

 	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
 	    zone->cond == BLK_ZONE_COND_EMPTY)
 		goto unlock;

 	if (vfs_truncate(&zone->file->f_path, 0)) {
 		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
 		ret = -EIO;
 		goto unlock;
 	}

 	zone->cond = BLK_ZONE_COND_EMPTY;
 	zone->wp = zone->start;
 	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);

 unlock:
 	mutex_unlock(&zone->lock);

 	return ret;
 }

 static int zloop_reset_all_zones(struct zloop_device *zlo)
 {
 	unsigned int i;
 	int ret;

 	for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
 		ret = zloop_reset_zone(zlo, i);
 		if (ret)
 			return ret;
 	}

 	return 0;
 }

 static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
 {
 	struct zloop_zone *zone = &zlo->zones[zone_no];
 	int ret = 0;

 	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
 		return -EIO;

 	mutex_lock(&zone->lock);

 	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
 	    zone->cond == BLK_ZONE_COND_FULL)
 		goto unlock;

 	if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
 		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
 		ret = -EIO;
 		goto unlock;
 	}

 	zone->cond = BLK_ZONE_COND_FULL;
 	zone->wp = zone->start + zlo->zone_size;
 	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);

  unlock:
 	mutex_unlock(&zone->lock);

 	return ret;
 }

 static void zloop_put_cmd(struct zloop_cmd *cmd)
 {
 	struct request *rq = blk_mq_rq_from_pdu(cmd);

 	if (!atomic_dec_and_test(&cmd->ref))
 		return;
 	kfree(cmd->bvec);
 	cmd->bvec = NULL;
 	if (likely(!blk_should_fake_timeout(rq->q)))
 		blk_mq_complete_request(rq);
 }

 static void zloop_rw_complete(struct kiocb *iocb, long ret)
 {
 	struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);

 	cmd->ret = ret;
 	zloop_put_cmd(cmd);
 }

 static void zloop_rw(struct zloop_cmd *cmd)
 {
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	struct zloop_device *zlo = rq->q->queuedata;
 	unsigned int zone_no = rq_zone_no(rq);
 	sector_t sector = blk_rq_pos(rq);
 	sector_t nr_sectors = blk_rq_sectors(rq);
 	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
 	bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
 	int rw = is_write ? ITER_SOURCE : ITER_DEST;
 	struct req_iterator rq_iter;
 	struct zloop_zone *zone;
 	struct iov_iter iter;
 	struct bio_vec tmp;
 	sector_t zone_end;
 	int nr_bvec = 0;
 	int ret;

 	atomic_set(&cmd->ref, 2);
 	cmd->sector = sector;
 	cmd->nr_sectors = nr_sectors;
 	cmd->ret = 0;

 	/* We should never get an I/O beyond the device capacity. */
 	if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
 		ret = -EIO;
 		goto out;
 	}
 	zone = &zlo->zones[zone_no];
 	zone_end = zone->start + zlo->zone_capacity;

 	/*
 	 * The block layer should never send requests that are not fully
 	 * contained within the zone.
 	 */
 	if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
 		ret = -EIO;
 		goto out;
 	}

 	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
 		mutex_lock(&zone->lock);
 		ret = zloop_update_seq_zone(zlo, zone_no);
 		mutex_unlock(&zone->lock);
 		if (ret)
 			goto out;
 	}

 	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
 		mutex_lock(&zone->lock);

 		if (is_append) {
 			sector = zone->wp;
 			cmd->sector = sector;
 		}

 		/*
 		 * Write operations must be aligned to the write pointer and
 		 * fully contained within the zone capacity.
 		 */
 		if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
 			pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
 			       zone_no, sector, zone->wp);
 			ret = -EIO;
 			goto unlock;
 		}

 		/* Implicitly open the target zone. */
 		if (zone->cond == BLK_ZONE_COND_CLOSED ||
 		    zone->cond == BLK_ZONE_COND_EMPTY)
 			zone->cond = BLK_ZONE_COND_IMP_OPEN;

 		/*
 		 * Advance the write pointer of sequential zones. If the write
 		 * fails, the wp position will be corrected when the next I/O
 		 * copmpletes.
 		 */
 		zone->wp += nr_sectors;
 		if (zone->wp == zone_end)
 			zone->cond = BLK_ZONE_COND_FULL;
 	}

 	rq_for_each_bvec(tmp, rq, rq_iter)
 		nr_bvec++;

 	if (rq->bio != rq->biotail) {
 		struct bio_vec *bvec;

 		cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO);
 		if (!cmd->bvec) {
 			ret = -EIO;
 			goto unlock;
 		}

 		/*
 		 * The bios of the request may be started from the middle of
 		 * the 'bvec' because of bio splitting, so we can't directly
 		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
 		 * API will take care of all details for us.
 		 */
 		bvec = cmd->bvec;
 		rq_for_each_bvec(tmp, rq, rq_iter) {
 			*bvec = tmp;
 			bvec++;
 		}
 		iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
 	} else {
 		/*
 		 * Same here, this bio may be started from the middle of the
 		 * 'bvec' because of bio splitting, so offset from the bvec
 		 * must be passed to iov iterator
 		 */
 		iov_iter_bvec(&iter, rw,
 			__bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
 					nr_bvec, blk_rq_bytes(rq));
 		iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
 	}

 	cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
 	cmd->iocb.ki_filp = zone->file;
 	cmd->iocb.ki_complete = zloop_rw_complete;
 	if (!zlo->buffered_io)
 		cmd->iocb.ki_flags = IOCB_DIRECT;
 	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);

 	if (rw == ITER_SOURCE)
 		ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
 	else
 		ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
 unlock:
 	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
 		mutex_unlock(&zone->lock);
 out:
 	if (ret != -EIOCBQUEUED)
 		zloop_rw_complete(&cmd->iocb, ret);
 	zloop_put_cmd(cmd);
 }

 static void zloop_handle_cmd(struct zloop_cmd *cmd)
 {
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	struct zloop_device *zlo = rq->q->queuedata;

 	switch (req_op(rq)) {
 	case REQ_OP_READ:
 	case REQ_OP_WRITE:
 	case REQ_OP_ZONE_APPEND:
 		/*
 		 * zloop_rw() always executes asynchronously or completes
 		 * directly.
 		 */
 		zloop_rw(cmd);
 		return;
 	case REQ_OP_FLUSH:
 		/*
 		 * Sync the entire FS containing the zone files instead of
 		 * walking all files
 		 */
 		cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb);
 		break;
 	case REQ_OP_ZONE_RESET:
 		cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
 		break;
 	case REQ_OP_ZONE_RESET_ALL:
 		cmd->ret = zloop_reset_all_zones(zlo);
 		break;
 	case REQ_OP_ZONE_FINISH:
 		cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
 		break;
 	case REQ_OP_ZONE_OPEN:
 		cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
 		break;
 	case REQ_OP_ZONE_CLOSE:
 		cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
 		break;
 	default:
 		WARN_ON_ONCE(1);
 		pr_err("Unsupported operation %d\n", req_op(rq));
 		cmd->ret = -EOPNOTSUPP;
 		break;
 	}

 	blk_mq_complete_request(rq);
 }

 static void zloop_cmd_workfn(struct work_struct *work)
 {
 	struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work);
 	int orig_flags = current->flags;

 	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
 	zloop_handle_cmd(cmd);
 	current->flags = orig_flags;
 }

 static void zloop_complete_rq(struct request *rq)
 {
 	struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
 	struct zloop_device *zlo = rq->q->queuedata;
 	unsigned int zone_no = cmd->sector >> zlo->zone_shift;
 	struct zloop_zone *zone = &zlo->zones[zone_no];
 	blk_status_t sts = BLK_STS_OK;

 	switch (req_op(rq)) {
 	case REQ_OP_READ:
 		if (cmd->ret < 0)
 			pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
 			       zone_no, cmd->sector, cmd->nr_sectors);

 		if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
 			/* short read */
 			struct bio *bio;

 			__rq_for_each_bio(bio, rq)
 				zero_fill_bio(bio);
 		}
 		break;
 	case REQ_OP_WRITE:
 	case REQ_OP_ZONE_APPEND:
 		if (cmd->ret < 0)
 			pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
 			       zone_no,
 			       req_op(rq) == REQ_OP_WRITE ? "" : "append ",
 			       cmd->sector, cmd->nr_sectors);

 		if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
 			pr_err("Zone %u: partial write %ld/%u B\n",
 			       zone_no, cmd->ret, blk_rq_bytes(rq));
 			cmd->ret = -EIO;
 		}

 		if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
 			/*
 			 * A write to a sequential zone file failed: mark the
 			 * zone as having an error. This will be corrected and
 			 * cleared when the next IO is submitted.
 			 */
 			set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
 			break;
 		}
 		if (req_op(rq) == REQ_OP_ZONE_APPEND)
 			rq->__sector = cmd->sector;

 		break;
 	default:
 		break;
 	}

 	if (cmd->ret < 0)
 		sts = errno_to_blk_status(cmd->ret);
 	blk_mq_end_request(rq, sts);
 }

 static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
 				   const struct blk_mq_queue_data *bd)
 {
 	struct request *rq = bd->rq;
 	struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
 	struct zloop_device *zlo = rq->q->queuedata;

 	if (zlo->state == Zlo_deleting)
 		return BLK_STS_IOERR;

 	blk_mq_start_request(rq);

 	INIT_WORK(&cmd->work, zloop_cmd_workfn);
 	queue_work(zlo->workqueue, &cmd->work);

 	return BLK_STS_OK;
 }

 static const struct blk_mq_ops zloop_mq_ops = {
 	.queue_rq       = zloop_queue_rq,
 	.complete	= zloop_complete_rq,
 };

 static int zloop_open(struct gendisk *disk, blk_mode_t mode)
 {
 	struct zloop_device *zlo = disk->private_data;
 	int ret;

 	ret = mutex_lock_killable(&zloop_ctl_mutex);
 	if (ret)
 		return ret;

 	if (zlo->state != Zlo_live)
 		ret = -ENXIO;
 	mutex_unlock(&zloop_ctl_mutex);
 	return ret;
 }

 static int zloop_report_zones(struct gendisk *disk, sector_t sector,
 		unsigned int nr_zones, report_zones_cb cb, void *data)
 {
 	struct zloop_device *zlo = disk->private_data;
 	struct blk_zone blkz = {};
 	unsigned int first, i;
 	int ret;

 	first = disk_zone_no(disk, sector);
 	if (first >= zlo->nr_zones)
 		return 0;
 	nr_zones = min(nr_zones, zlo->nr_zones - first);

 	for (i = 0; i < nr_zones; i++) {
 		unsigned int zone_no = first + i;
 		struct zloop_zone *zone = &zlo->zones[zone_no];

 		mutex_lock(&zone->lock);

 		if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
 			ret = zloop_update_seq_zone(zlo, zone_no);
 			if (ret) {
 				mutex_unlock(&zone->lock);
 				return ret;
 			}
 		}

 		blkz.start = zone->start;
 		blkz.len = zlo->zone_size;
 		blkz.wp = zone->wp;
 		blkz.cond = zone->cond;
 		if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
 			blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
 			blkz.capacity = zlo->zone_size;
 		} else {
 			blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
 			blkz.capacity = zlo->zone_capacity;
 		}

 		mutex_unlock(&zone->lock);

 		ret = cb(&blkz, i, data);
 		if (ret)
 			return ret;
 	}

 	return nr_zones;
 }

 static void zloop_free_disk(struct gendisk *disk)
 {
 	struct zloop_device *zlo = disk->private_data;
 	unsigned int i;

 	for (i = 0; i < zlo->nr_zones; i++) {
 		struct zloop_zone *zone = &zlo->zones[i];

 		mapping_set_gfp_mask(zone->file->f_mapping,
 				zone->old_gfp_mask);
 		fput(zone->file);
 	}

 	fput(zlo->data_dir);
 	destroy_workqueue(zlo->workqueue);
 	kfree(zlo->base_dir);
 	kvfree(zlo);
 }

 static const struct block_device_operations zloop_fops = {
 	.owner			= THIS_MODULE,
 	.open			= zloop_open,
 	.report_zones		= zloop_report_zones,
 	.free_disk		= zloop_free_disk,
 };

 __printf(3, 4)
 static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
 		const char *fmt, ...)
 {
 	struct file *file;
 	va_list ap;
 	char *p;

 	va_start(ap, fmt);
 	p = kvasprintf(GFP_KERNEL, fmt, ap);
 	va_end(ap);

 	if (!p)
 		return ERR_PTR(-ENOMEM);
 	file = filp_open(p, oflags, mode);
 	kfree(p);
 	return file;
 }

 static int zloop_get_block_size(struct zloop_device *zlo,
 				struct zloop_zone *zone)
 {
 	struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
 	struct kstat st;

 	/*
 	 * If the FS block size is lower than or equal to 4K, use that as the
 	 * device block size. Otherwise, fallback to the FS direct IO alignment
 	 * constraint if that is provided, and to the FS underlying device
 	 * physical block size if the direct IO alignment is unknown.
 	 */
 	if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
 		zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
 	else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
 		 (st.result_mask & STATX_DIOALIGN))
 		zlo->block_size = st.dio_offset_align;
 	else if (sb_bdev)
 		zlo->block_size = bdev_physical_block_size(sb_bdev);
 	else
 		zlo->block_size = SECTOR_SIZE;

 	if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
 		pr_err("Zone capacity is not aligned to block size %u\n",
 		       zlo->block_size);
 		return -EINVAL;
 	}

 	return 0;
 }

 static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
 			   unsigned int zone_no, bool restore)
 {
 	struct zloop_zone *zone = &zlo->zones[zone_no];
 	int oflags = O_RDWR;
 	struct kstat stat;
 	sector_t file_sectors;
 	int ret;

 	mutex_init(&zone->lock);
 	zone->start = (sector_t)zone_no << zlo->zone_shift;

 	if (!restore)
 		oflags |= O_CREAT;

 	if (!opts->buffered_io)
 		oflags |= O_DIRECT;

 	if (zone_no < zlo->nr_conv_zones) {
 		/* Conventional zone file. */
 		set_bit(ZLOOP_ZONE_CONV, &zone->flags);
 		zone->cond = BLK_ZONE_COND_NOT_WP;
 		zone->wp = U64_MAX;

 		zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
 					zlo->base_dir, zlo->id, zone_no);
 		if (IS_ERR(zone->file)) {
 			pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
 			       zone_no, zlo->base_dir, zlo->id, zone_no,
 			       PTR_ERR(zone->file));
 			return PTR_ERR(zone->file);
 		}

 		if (!zlo->block_size) {
 			ret = zloop_get_block_size(zlo, zone);
 			if (ret)
 				return ret;
 		}

 		ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
 		if (ret < 0) {
 			pr_err("Failed to get zone %u file stat\n", zone_no);
 			return ret;
 		}
 		file_sectors = stat.size >> SECTOR_SHIFT;

 		if (restore && file_sectors != zlo->zone_size) {
 			pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
 			       zone_no, file_sectors, zlo->zone_capacity);
 			return ret;
 		}

 		ret = vfs_truncate(&zone->file->f_path,
 				   zlo->zone_size << SECTOR_SHIFT);
 		if (ret < 0) {
 			pr_err("Failed to truncate zone %u file (err=%d)\n",
 			       zone_no, ret);
 			return ret;
 		}

 		return 0;
 	}

 	/* Sequential zone file. */
 	zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
 					 zlo->base_dir, zlo->id, zone_no);
 	if (IS_ERR(zone->file)) {
 		pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
 		       zone_no, zlo->base_dir, zlo->id, zone_no,
 		       PTR_ERR(zone->file));
 		return PTR_ERR(zone->file);
 	}

 	if (!zlo->block_size) {
 		ret = zloop_get_block_size(zlo, zone);
 		if (ret)
 			return ret;
 	}

 	zloop_get_block_size(zlo, zone);

 	mutex_lock(&zone->lock);
 	ret = zloop_update_seq_zone(zlo, zone_no);
 	mutex_unlock(&zone->lock);

 	return ret;
 }

 static bool zloop_dev_exists(struct zloop_device *zlo)
 {
 	struct file *cnv, *seq;
 	bool exists;

 	cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
 				  zlo->base_dir, zlo->id, 0);
 	seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
 				  zlo->base_dir, zlo->id, 0);
 	exists = !IS_ERR(cnv) || !IS_ERR(seq);

 	if (!IS_ERR(cnv))
 		fput(cnv);
 	if (!IS_ERR(seq))
 		fput(seq);

 	return exists;
 }

 static int zloop_ctl_add(struct zloop_options *opts)
 {
 	struct queue_limits lim = {
 		.max_hw_sectors		= SZ_1M >> SECTOR_SHIFT,
 		.max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
 		.chunk_sectors		= opts->zone_size,
 		.features		= BLK_FEAT_ZONED,
 	};
 	unsigned int nr_zones, i, j;
 	struct zloop_device *zlo;
 	int ret = -EINVAL;
 	bool restore;

 	__module_get(THIS_MODULE);

 	nr_zones = opts->capacity >> ilog2(opts->zone_size);
 	if (opts->nr_conv_zones >= nr_zones) {
 		pr_err("Invalid number of conventional zones %u\n",
 		       opts->nr_conv_zones);
 		goto out;
 	}

 	zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL);
 	if (!zlo) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	zlo->state = Zlo_creating;

 	ret = mutex_lock_killable(&zloop_ctl_mutex);
 	if (ret)
 		goto out_free_dev;

 	/* Allocate id, if @opts->id >= 0, we're requesting that specific id */
 	if (opts->id >= 0) {
 		ret = idr_alloc(&zloop_index_idr, zlo,
 				  opts->id, opts->id + 1, GFP_KERNEL);
 		if (ret == -ENOSPC)
 			ret = -EEXIST;
 	} else {
 		ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
 	}
 	mutex_unlock(&zloop_ctl_mutex);
 	if (ret < 0)
 		goto out_free_dev;

 	zlo->id = ret;
 	zlo->zone_shift = ilog2(opts->zone_size);
 	zlo->zone_size = opts->zone_size;
 	if (opts->zone_capacity)
 		zlo->zone_capacity = opts->zone_capacity;
 	else
 		zlo->zone_capacity = zlo->zone_size;
 	zlo->nr_zones = nr_zones;
 	zlo->nr_conv_zones = opts->nr_conv_zones;
 	zlo->buffered_io = opts->buffered_io;

 	zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
 				opts->nr_queues * opts->queue_depth, zlo->id);
 	if (!zlo->workqueue) {
 		ret = -ENOMEM;
 		goto out_free_idr;
 	}

 	if (opts->base_dir)
 		zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
 	else
 		zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
 	if (!zlo->base_dir) {
 		ret = -ENOMEM;
 		goto out_destroy_workqueue;
 	}

 	zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
 					    zlo->base_dir, zlo->id);
 	if (IS_ERR(zlo->data_dir)) {
 		ret = PTR_ERR(zlo->data_dir);
 		pr_warn("Failed to open directory %s/%u (err=%d)\n",
 			zlo->base_dir, zlo->id, ret);
 		goto out_free_base_dir;
 	}

 	/*
 	 * If we already have zone files, we are restoring a device created by a
 	 * previous add operation. In this case, zloop_init_zone() will check
 	 * that the zone files are consistent with the zone configuration given.
 	 */
 	restore = zloop_dev_exists(zlo);
 	for (i = 0; i < nr_zones; i++) {
 		ret = zloop_init_zone(zlo, opts, i, restore);
 		if (ret)
 			goto out_close_files;
 	}

 	lim.physical_block_size = zlo->block_size;
 	lim.logical_block_size = zlo->block_size;

 	zlo->tag_set.ops = &zloop_mq_ops;
 	zlo->tag_set.nr_hw_queues = opts->nr_queues;
 	zlo->tag_set.queue_depth = opts->queue_depth;
 	zlo->tag_set.numa_node = NUMA_NO_NODE;
 	zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
 	zlo->tag_set.driver_data = zlo;

 	ret = blk_mq_alloc_tag_set(&zlo->tag_set);
 	if (ret) {
 		pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
 		goto out_close_files;
 	}

 	zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
 	if (IS_ERR(zlo->disk)) {
 		pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
 		ret = PTR_ERR(zlo->disk);
 		goto out_cleanup_tags;
 	}
 	zlo->disk->flags = GENHD_FL_NO_PART;
 	zlo->disk->fops = &zloop_fops;
 	zlo->disk->private_data = zlo;
 	sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
 	set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);

 	ret = blk_revalidate_disk_zones(zlo->disk);
 	if (ret)
 		goto out_cleanup_disk;

 	ret = add_disk(zlo->disk);
 	if (ret) {
 		pr_err("add_disk failed (err=%d)\n", ret);
 		goto out_cleanup_disk;
 	}

 	mutex_lock(&zloop_ctl_mutex);
 	zlo->state = Zlo_live;
 	mutex_unlock(&zloop_ctl_mutex);

 	pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
 		zlo->id, zlo->nr_zones,
 		((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
 		zlo->block_size);

 	return 0;

 out_cleanup_disk:
 	put_disk(zlo->disk);
 out_cleanup_tags:
 	blk_mq_free_tag_set(&zlo->tag_set);
 out_close_files:
 	for (j = 0; j < i; j++) {
 		struct zloop_zone *zone = &zlo->zones[j];

 		if (!IS_ERR_OR_NULL(zone->file))
 			fput(zone->file);
 	}
 	fput(zlo->data_dir);
 out_free_base_dir:
 	kfree(zlo->base_dir);
 out_destroy_workqueue:
 	destroy_workqueue(zlo->workqueue);
 out_free_idr:
 	mutex_lock(&zloop_ctl_mutex);
 	idr_remove(&zloop_index_idr, zlo->id);
 	mutex_unlock(&zloop_ctl_mutex);
 out_free_dev:
 	kvfree(zlo);
 out:
 	module_put(THIS_MODULE);
 	if (ret == -ENOENT)
 		ret = -EINVAL;
 	return ret;
 }

 static int zloop_ctl_remove(struct zloop_options *opts)
 {
 	struct zloop_device *zlo;
 	int ret;

 	if (!(opts->mask & ZLOOP_OPT_ID)) {
 		pr_err("No ID specified\n");
 		return -EINVAL;
 	}

 	ret = mutex_lock_killable(&zloop_ctl_mutex);
 	if (ret)
 		return ret;

 	zlo = idr_find(&zloop_index_idr, opts->id);
 	if (!zlo || zlo->state == Zlo_creating) {
 		ret = -ENODEV;
 	} else if (zlo->state == Zlo_deleting) {
 		ret = -EINVAL;
 	} else {
 		idr_remove(&zloop_index_idr, zlo->id);
 		zlo->state = Zlo_deleting;
 	}

 	mutex_unlock(&zloop_ctl_mutex);
 	if (ret)
 		return ret;

 	del_gendisk(zlo->disk);
 	put_disk(zlo->disk);
 	blk_mq_free_tag_set(&zlo->tag_set);

 	pr_info("Removed device %d\n", opts->id);

 	module_put(THIS_MODULE);

 	return 0;
 }

 static int zloop_parse_options(struct zloop_options *opts, const char *buf)
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *options, *o, *p;
 	unsigned int token;
 	int ret = 0;

 	/* Set defaults. */
 	opts->mask = 0;
 	opts->id = ZLOOP_DEF_ID;
 	opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
 	opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
 	opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
 	opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
 	opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
 	opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;

 	if (!buf)
 		return 0;

 	/* Skip leading spaces before the options. */
 	while (isspace(*buf))
 		buf++;

 	options = o = kstrdup(buf, GFP_KERNEL);
 	if (!options)
 		return -ENOMEM;

 	/* Parse the options, doing only some light invalid value checks. */
 	while ((p = strsep(&o, ",\n")) != NULL) {
 		if (!*p)
 			continue;

 		token = match_token(p, zloop_opt_tokens, args);
 		opts->mask |= token;
 		switch (token) {
 		case ZLOOP_OPT_ID:
 			if (match_int(args, &opts->id)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			break;
 		case ZLOOP_OPT_CAPACITY:
 			if (match_uint(args, &token)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			if (!token) {
 				pr_err("Invalid capacity\n");
 				ret = -EINVAL;
 				goto out;
 			}
 			opts->capacity =
 				((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
 			break;
 		case ZLOOP_OPT_ZONE_SIZE:
 			if (match_uint(args, &token)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB ||
 			    !is_power_of_2(token)) {
 				pr_err("Invalid zone size %u\n", token);
 				ret = -EINVAL;
 				goto out;
 			}
 			opts->zone_size =
 				((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
 			break;
 		case ZLOOP_OPT_ZONE_CAPACITY:
 			if (match_uint(args, &token)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			if (!token) {
 				pr_err("Invalid zone capacity\n");
 				ret = -EINVAL;
 				goto out;
 			}
 			opts->zone_capacity =
 				((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
 			break;
 		case ZLOOP_OPT_NR_CONV_ZONES:
 			if (match_uint(args, &token)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			opts->nr_conv_zones = token;
 			break;
 		case ZLOOP_OPT_BASE_DIR:
 			p = match_strdup(args);
 			if (!p) {
 				ret = -ENOMEM;
 				goto out;
 			}
 			kfree(opts->base_dir);
 			opts->base_dir = p;
 			break;
 		case ZLOOP_OPT_NR_QUEUES:
 			if (match_uint(args, &token)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			if (!token) {
 				pr_err("Invalid number of queues\n");
 				ret = -EINVAL;
 				goto out;
 			}
 			opts->nr_queues = min(token, num_online_cpus());
 			break;
 		case ZLOOP_OPT_QUEUE_DEPTH:
 			if (match_uint(args, &token)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			if (!token) {
 				pr_err("Invalid queue depth\n");
 				ret = -EINVAL;
 				goto out;
 			}
 			opts->queue_depth = token;
 			break;
 		case ZLOOP_OPT_BUFFERED_IO:
 			opts->buffered_io = true;
 			break;
 		case ZLOOP_OPT_ERR:
 		default:
 			pr_warn("unknown parameter or missing value '%s'\n", p);
 			ret = -EINVAL;
 			goto out;
 		}
 	}

 	ret = -EINVAL;
 	if (opts->capacity <= opts->zone_size) {
 		pr_err("Invalid capacity\n");
 		goto out;
 	}

 	if (opts->zone_capacity > opts->zone_size) {
 		pr_err("Invalid zone capacity\n");
 		goto out;
 	}

 	ret = 0;
 out:
 	kfree(options);
 	return ret;
 }

 enum {
 	ZLOOP_CTL_ADD,
 	ZLOOP_CTL_REMOVE,
 };

 static struct zloop_ctl_op {
 	int		code;
 	const char	*name;
 } zloop_ctl_ops[] = {
 	{ ZLOOP_CTL_ADD,	"add" },
 	{ ZLOOP_CTL_REMOVE,	"remove" },
 	{ -1,	NULL },
 };

 static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
 			       size_t count, loff_t *pos)
 {
 	struct zloop_options opts = { };
 	struct zloop_ctl_op *op;
 	const char *buf, *opts_buf;
 	int i, ret;

 	if (count > PAGE_SIZE)
 		return -ENOMEM;

 	buf = memdup_user_nul(ubuf, count);
 	if (IS_ERR(buf))
 		return PTR_ERR(buf);

 	for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) {
 		op = &zloop_ctl_ops[i];
 		if (!op->name) {
 			pr_err("Invalid operation\n");
 			ret = -EINVAL;
 			goto out;
 		}
 		if (!strncmp(buf, op->name, strlen(op->name)))
 			break;
 	}

 	if (count <= strlen(op->name))
 		opts_buf = NULL;
 	else
 		opts_buf = buf + strlen(op->name);

 	ret = zloop_parse_options(&opts, opts_buf);
 	if (ret) {
 		pr_err("Failed to parse options\n");
 		goto out;
 	}

 	switch (op->code) {
 	case ZLOOP_CTL_ADD:
 		ret = zloop_ctl_add(&opts);
 		break;
 	case ZLOOP_CTL_REMOVE:
 		ret = zloop_ctl_remove(&opts);
 		break;
 	default:
 		pr_err("Invalid operation\n");
 		ret = -EINVAL;
 		goto out;
 	}

 out:
 	kfree(opts.base_dir);
 	kfree(buf);
 	return ret ? ret : count;
 }

 static int zloop_ctl_show(struct seq_file *seq_file, void *private)
 {
 	const struct match_token *tok;
 	int i;

 	/* Add operation */
 	seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
 	for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
 		tok = &zloop_opt_tokens[i];
 		if (!tok->pattern)
 			break;
 		if (i)
 			seq_putc(seq_file, ',');
 		seq_puts(seq_file, tok->pattern);
 	}
 	seq_putc(seq_file, '\n');

 	/* Remove operation */
 	seq_puts(seq_file, zloop_ctl_ops[1].name);
 	seq_puts(seq_file, " id=%d\n");

 	return 0;
 }

 static int zloop_ctl_open(struct inode *inode, struct file *file)
 {
 	file->private_data = NULL;
 	return single_open(file, zloop_ctl_show, NULL);
 }

 static int zloop_ctl_release(struct inode *inode, struct file *file)
 {
 	return single_release(inode, file);
 }

 static const struct file_operations zloop_ctl_fops = {
 	.owner		= THIS_MODULE,
 	.open		= zloop_ctl_open,
 	.release	= zloop_ctl_release,
 	.write		= zloop_ctl_write,
 	.read		= seq_read,
 };

 static struct miscdevice zloop_misc = {
 	.minor		= MISC_DYNAMIC_MINOR,
 	.name		= "zloop-control",
 	.fops		= &zloop_ctl_fops,
 };

 static int __init zloop_init(void)
 {
 	int ret;

 	ret = misc_register(&zloop_misc);
 	if (ret) {
 		pr_err("Failed to register misc device: %d\n", ret);
 		return ret;
 	}
 	pr_info("Module loaded\n");

 	return 0;
 }

 static void __exit zloop_exit(void)
 {
 	misc_deregister(&zloop_misc);
 	idr_destroy(&zloop_index_idr);
 }

 module_init(zloop_init);
 module_exit(zloop_exit);

 MODULE_DESCRIPTION("Zoned loopback device");
 MODULE_LICENSE("GPL");