blob: 553b1a713ab9154bd5ee5258a17959efbb7e11b8 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2025, Christoph Hellwig.
* Copyright (c) 2025, Western Digital Corporation or its affiliates.
*
* Zoned Loop Device driver - exports a zoned block device using one file per
* zone as backing storage.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/blkzoned.h>
#include <linux/pagemap.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/mutex.h>
#include <linux/parser.h>
#include <linux/seq_file.h>
/*
* Options for adding (and removing) a device.
*/
enum {
ZLOOP_OPT_ERR = 0,
ZLOOP_OPT_ID = (1 << 0),
ZLOOP_OPT_CAPACITY = (1 << 1),
ZLOOP_OPT_ZONE_SIZE = (1 << 2),
ZLOOP_OPT_ZONE_CAPACITY = (1 << 3),
ZLOOP_OPT_NR_CONV_ZONES = (1 << 4),
ZLOOP_OPT_BASE_DIR = (1 << 5),
ZLOOP_OPT_NR_QUEUES = (1 << 6),
ZLOOP_OPT_QUEUE_DEPTH = (1 << 7),
ZLOOP_OPT_BUFFERED_IO = (1 << 8),
};
static const match_table_t zloop_opt_tokens = {
{ ZLOOP_OPT_ID, "id=%d" },
{ ZLOOP_OPT_CAPACITY, "capacity_mb=%u" },
{ ZLOOP_OPT_ZONE_SIZE, "zone_size_mb=%u" },
{ ZLOOP_OPT_ZONE_CAPACITY, "zone_capacity_mb=%u" },
{ ZLOOP_OPT_NR_CONV_ZONES, "conv_zones=%u" },
{ ZLOOP_OPT_BASE_DIR, "base_dir=%s" },
{ ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" },
{ ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" },
{ ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
{ ZLOOP_OPT_ERR, NULL }
};
/* Default values for the "add" operation. */
#define ZLOOP_DEF_ID -1
#define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT)
#define ZLOOP_DEF_NR_ZONES 64
#define ZLOOP_DEF_NR_CONV_ZONES 8
#define ZLOOP_DEF_BASE_DIR "/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES 1
#define ZLOOP_DEF_QUEUE_DEPTH 128
#define ZLOOP_DEF_BUFFERED_IO false
/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB 16384
struct zloop_options {
unsigned int mask;
int id;
sector_t capacity;
sector_t zone_size;
sector_t zone_capacity;
unsigned int nr_conv_zones;
char *base_dir;
unsigned int nr_queues;
unsigned int queue_depth;
bool buffered_io;
};
/*
* Device states.
*/
enum {
Zlo_creating = 0,
Zlo_live,
Zlo_deleting,
};
enum zloop_zone_flags {
ZLOOP_ZONE_CONV = 0,
ZLOOP_ZONE_SEQ_ERROR,
};
struct zloop_zone {
struct file *file;
unsigned long flags;
struct mutex lock;
enum blk_zone_cond cond;
sector_t start;
sector_t wp;
gfp_t old_gfp_mask;
};
struct zloop_device {
unsigned int id;
unsigned int state;
struct blk_mq_tag_set tag_set;
struct gendisk *disk;
struct workqueue_struct *workqueue;
bool buffered_io;
const char *base_dir;
struct file *data_dir;
unsigned int zone_shift;
sector_t zone_size;
sector_t zone_capacity;
unsigned int nr_zones;
unsigned int nr_conv_zones;
unsigned int block_size;
struct zloop_zone zones[] __counted_by(nr_zones);
};
struct zloop_cmd {
struct work_struct work;
atomic_t ref;
sector_t sector;
sector_t nr_sectors;
long ret;
struct kiocb iocb;
struct bio_vec *bvec;
};
static DEFINE_IDR(zloop_index_idr);
static DEFINE_MUTEX(zloop_ctl_mutex);
static unsigned int rq_zone_no(struct request *rq)
{
struct zloop_device *zlo = rq->q->queuedata;
return blk_rq_pos(rq) >> zlo->zone_shift;
}
static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
struct kstat stat;
sector_t file_sectors;
int ret;
lockdep_assert_held(&zone->lock);
ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
if (ret < 0) {
pr_err("Failed to get zone %u file stat (err=%d)\n",
zone_no, ret);
set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
return ret;
}
file_sectors = stat.size >> SECTOR_SHIFT;
if (file_sectors > zlo->zone_capacity) {
pr_err("Zone %u file too large (%llu sectors > %llu)\n",
zone_no, file_sectors, zlo->zone_capacity);
return -EINVAL;
}
if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
pr_err("Zone %u file size not aligned to block size %u\n",
zone_no, zlo->block_size);
return -EINVAL;
}
if (!file_sectors) {
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
} else if (file_sectors == zlo->zone_capacity) {
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = zone->start + zlo->zone_size;
} else {
zone->cond = BLK_ZONE_COND_CLOSED;
zone->wp = zone->start + file_sectors;
}
return 0;
}
static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
return -EIO;
mutex_lock(&zone->lock);
if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
ret = zloop_update_seq_zone(zlo, zone_no);
if (ret)
goto unlock;
}
switch (zone->cond) {
case BLK_ZONE_COND_EXP_OPEN:
break;
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_CLOSED:
case BLK_ZONE_COND_IMP_OPEN:
zone->cond = BLK_ZONE_COND_EXP_OPEN;
break;
case BLK_ZONE_COND_FULL:
default:
ret = -EIO;
break;
}
unlock:
mutex_unlock(&zone->lock);
return ret;
}
static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
return -EIO;
mutex_lock(&zone->lock);
if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
ret = zloop_update_seq_zone(zlo, zone_no);
if (ret)
goto unlock;
}
switch (zone->cond) {
case BLK_ZONE_COND_CLOSED:
break;
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
if (zone->wp == zone->start)
zone->cond = BLK_ZONE_COND_EMPTY;
else
zone->cond = BLK_ZONE_COND_CLOSED;
break;
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_FULL:
default:
ret = -EIO;
break;
}
unlock:
mutex_unlock(&zone->lock);
return ret;
}
static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
return -EIO;
mutex_lock(&zone->lock);
if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
zone->cond == BLK_ZONE_COND_EMPTY)
goto unlock;
if (vfs_truncate(&zone->file->f_path, 0)) {
set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
ret = -EIO;
goto unlock;
}
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
unlock:
mutex_unlock(&zone->lock);
return ret;
}
static int zloop_reset_all_zones(struct zloop_device *zlo)
{
unsigned int i;
int ret;
for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
ret = zloop_reset_zone(zlo, i);
if (ret)
return ret;
}
return 0;
}
static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
return -EIO;
mutex_lock(&zone->lock);
if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
zone->cond == BLK_ZONE_COND_FULL)
goto unlock;
if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
ret = -EIO;
goto unlock;
}
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = zone->start + zlo->zone_size;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
unlock:
mutex_unlock(&zone->lock);
return ret;
}
static void zloop_put_cmd(struct zloop_cmd *cmd)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
if (!atomic_dec_and_test(&cmd->ref))
return;
kfree(cmd->bvec);
cmd->bvec = NULL;
if (likely(!blk_should_fake_timeout(rq->q)))
blk_mq_complete_request(rq);
}
static void zloop_rw_complete(struct kiocb *iocb, long ret)
{
struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);
cmd->ret = ret;
zloop_put_cmd(cmd);
}
static void zloop_rw(struct zloop_cmd *cmd)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct zloop_device *zlo = rq->q->queuedata;
unsigned int zone_no = rq_zone_no(rq);
sector_t sector = blk_rq_pos(rq);
sector_t nr_sectors = blk_rq_sectors(rq);
bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
int rw = is_write ? ITER_SOURCE : ITER_DEST;
struct req_iterator rq_iter;
struct zloop_zone *zone;
struct iov_iter iter;
struct bio_vec tmp;
sector_t zone_end;
int nr_bvec = 0;
int ret;
atomic_set(&cmd->ref, 2);
cmd->sector = sector;
cmd->nr_sectors = nr_sectors;
cmd->ret = 0;
/* We should never get an I/O beyond the device capacity. */
if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
ret = -EIO;
goto out;
}
zone = &zlo->zones[zone_no];
zone_end = zone->start + zlo->zone_capacity;
/*
* The block layer should never send requests that are not fully
* contained within the zone.
*/
if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
ret = -EIO;
goto out;
}
if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
mutex_lock(&zone->lock);
ret = zloop_update_seq_zone(zlo, zone_no);
mutex_unlock(&zone->lock);
if (ret)
goto out;
}
if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
mutex_lock(&zone->lock);
if (is_append) {
sector = zone->wp;
cmd->sector = sector;
}
/*
* Write operations must be aligned to the write pointer and
* fully contained within the zone capacity.
*/
if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
zone_no, sector, zone->wp);
ret = -EIO;
goto unlock;
}
/* Implicitly open the target zone. */
if (zone->cond == BLK_ZONE_COND_CLOSED ||
zone->cond == BLK_ZONE_COND_EMPTY)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
/*
* Advance the write pointer of sequential zones. If the write
* fails, the wp position will be corrected when the next I/O
* copmpletes.
*/
zone->wp += nr_sectors;
if (zone->wp == zone_end)
zone->cond = BLK_ZONE_COND_FULL;
}
rq_for_each_bvec(tmp, rq, rq_iter)
nr_bvec++;
if (rq->bio != rq->biotail) {
struct bio_vec *bvec;
cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO);
if (!cmd->bvec) {
ret = -EIO;
goto unlock;
}
/*
* The bios of the request may be started from the middle of
* the 'bvec' because of bio splitting, so we can't directly
* copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
* API will take care of all details for us.
*/
bvec = cmd->bvec;
rq_for_each_bvec(tmp, rq, rq_iter) {
*bvec = tmp;
bvec++;
}
iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
} else {
/*
* Same here, this bio may be started from the middle of the
* 'bvec' because of bio splitting, so offset from the bvec
* must be passed to iov iterator
*/
iov_iter_bvec(&iter, rw,
__bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
nr_bvec, blk_rq_bytes(rq));
iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
}
cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
cmd->iocb.ki_filp = zone->file;
cmd->iocb.ki_complete = zloop_rw_complete;
if (!zlo->buffered_io)
cmd->iocb.ki_flags = IOCB_DIRECT;
cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
if (rw == ITER_SOURCE)
ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
else
ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
unlock:
if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
mutex_unlock(&zone->lock);
out:
if (ret != -EIOCBQUEUED)
zloop_rw_complete(&cmd->iocb, ret);
zloop_put_cmd(cmd);
}
static void zloop_handle_cmd(struct zloop_cmd *cmd)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct zloop_device *zlo = rq->q->queuedata;
switch (req_op(rq)) {
case REQ_OP_READ:
case REQ_OP_WRITE:
case REQ_OP_ZONE_APPEND:
/*
* zloop_rw() always executes asynchronously or completes
* directly.
*/
zloop_rw(cmd);
return;
case REQ_OP_FLUSH:
/*
* Sync the entire FS containing the zone files instead of
* walking all files
*/
cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb);
break;
case REQ_OP_ZONE_RESET:
cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
break;
case REQ_OP_ZONE_RESET_ALL:
cmd->ret = zloop_reset_all_zones(zlo);
break;
case REQ_OP_ZONE_FINISH:
cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
break;
case REQ_OP_ZONE_OPEN:
cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
break;
case REQ_OP_ZONE_CLOSE:
cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
break;
default:
WARN_ON_ONCE(1);
pr_err("Unsupported operation %d\n", req_op(rq));
cmd->ret = -EOPNOTSUPP;
break;
}
blk_mq_complete_request(rq);
}
static void zloop_cmd_workfn(struct work_struct *work)
{
struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work);
int orig_flags = current->flags;
current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
zloop_handle_cmd(cmd);
current->flags = orig_flags;
}
static void zloop_complete_rq(struct request *rq)
{
struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
struct zloop_device *zlo = rq->q->queuedata;
unsigned int zone_no = cmd->sector >> zlo->zone_shift;
struct zloop_zone *zone = &zlo->zones[zone_no];
blk_status_t sts = BLK_STS_OK;
switch (req_op(rq)) {
case REQ_OP_READ:
if (cmd->ret < 0)
pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
zone_no, cmd->sector, cmd->nr_sectors);
if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
/* short read */
struct bio *bio;
__rq_for_each_bio(bio, rq)
zero_fill_bio(bio);
}
break;
case REQ_OP_WRITE:
case REQ_OP_ZONE_APPEND:
if (cmd->ret < 0)
pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
zone_no,
req_op(rq) == REQ_OP_WRITE ? "" : "append ",
cmd->sector, cmd->nr_sectors);
if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
pr_err("Zone %u: partial write %ld/%u B\n",
zone_no, cmd->ret, blk_rq_bytes(rq));
cmd->ret = -EIO;
}
if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
/*
* A write to a sequential zone file failed: mark the
* zone as having an error. This will be corrected and
* cleared when the next IO is submitted.
*/
set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
break;
}
if (req_op(rq) == REQ_OP_ZONE_APPEND)
rq->__sector = cmd->sector;
break;
default:
break;
}
if (cmd->ret < 0)
sts = errno_to_blk_status(cmd->ret);
blk_mq_end_request(rq, sts);
}
static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct request *rq = bd->rq;
struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
struct zloop_device *zlo = rq->q->queuedata;
if (zlo->state == Zlo_deleting)
return BLK_STS_IOERR;
blk_mq_start_request(rq);
INIT_WORK(&cmd->work, zloop_cmd_workfn);
queue_work(zlo->workqueue, &cmd->work);
return BLK_STS_OK;
}
static const struct blk_mq_ops zloop_mq_ops = {
.queue_rq = zloop_queue_rq,
.complete = zloop_complete_rq,
};
static int zloop_open(struct gendisk *disk, blk_mode_t mode)
{
struct zloop_device *zlo = disk->private_data;
int ret;
ret = mutex_lock_killable(&zloop_ctl_mutex);
if (ret)
return ret;
if (zlo->state != Zlo_live)
ret = -ENXIO;
mutex_unlock(&zloop_ctl_mutex);
return ret;
}
static int zloop_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data)
{
struct zloop_device *zlo = disk->private_data;
struct blk_zone blkz = {};
unsigned int first, i;
int ret;
first = disk_zone_no(disk, sector);
if (first >= zlo->nr_zones)
return 0;
nr_zones = min(nr_zones, zlo->nr_zones - first);
for (i = 0; i < nr_zones; i++) {
unsigned int zone_no = first + i;
struct zloop_zone *zone = &zlo->zones[zone_no];
mutex_lock(&zone->lock);
if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
ret = zloop_update_seq_zone(zlo, zone_no);
if (ret) {
mutex_unlock(&zone->lock);
return ret;
}
}
blkz.start = zone->start;
blkz.len = zlo->zone_size;
blkz.wp = zone->wp;
blkz.cond = zone->cond;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
blkz.capacity = zlo->zone_size;
} else {
blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
blkz.capacity = zlo->zone_capacity;
}
mutex_unlock(&zone->lock);
ret = cb(&blkz, i, data);
if (ret)
return ret;
}
return nr_zones;
}
static void zloop_free_disk(struct gendisk *disk)
{
struct zloop_device *zlo = disk->private_data;
unsigned int i;
for (i = 0; i < zlo->nr_zones; i++) {
struct zloop_zone *zone = &zlo->zones[i];
mapping_set_gfp_mask(zone->file->f_mapping,
zone->old_gfp_mask);
fput(zone->file);
}
fput(zlo->data_dir);
destroy_workqueue(zlo->workqueue);
kfree(zlo->base_dir);
kvfree(zlo);
}
static const struct block_device_operations zloop_fops = {
.owner = THIS_MODULE,
.open = zloop_open,
.report_zones = zloop_report_zones,
.free_disk = zloop_free_disk,
};
__printf(3, 4)
static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
const char *fmt, ...)
{
struct file *file;
va_list ap;
char *p;
va_start(ap, fmt);
p = kvasprintf(GFP_KERNEL, fmt, ap);
va_end(ap);
if (!p)
return ERR_PTR(-ENOMEM);
file = filp_open(p, oflags, mode);
kfree(p);
return file;
}
static int zloop_get_block_size(struct zloop_device *zlo,
struct zloop_zone *zone)
{
struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
struct kstat st;
/*
* If the FS block size is lower than or equal to 4K, use that as the
* device block size. Otherwise, fallback to the FS direct IO alignment
* constraint if that is provided, and to the FS underlying device
* physical block size if the direct IO alignment is unknown.
*/
if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
(st.result_mask & STATX_DIOALIGN))
zlo->block_size = st.dio_offset_align;
else if (sb_bdev)
zlo->block_size = bdev_physical_block_size(sb_bdev);
else
zlo->block_size = SECTOR_SIZE;
if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
pr_err("Zone capacity is not aligned to block size %u\n",
zlo->block_size);
return -EINVAL;
}
return 0;
}
static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
unsigned int zone_no, bool restore)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
int oflags = O_RDWR;
struct kstat stat;
sector_t file_sectors;
int ret;
mutex_init(&zone->lock);
zone->start = (sector_t)zone_no << zlo->zone_shift;
if (!restore)
oflags |= O_CREAT;
if (!opts->buffered_io)
oflags |= O_DIRECT;
if (zone_no < zlo->nr_conv_zones) {
/* Conventional zone file. */
set_bit(ZLOOP_ZONE_CONV, &zone->flags);
zone->cond = BLK_ZONE_COND_NOT_WP;
zone->wp = U64_MAX;
zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
zlo->base_dir, zlo->id, zone_no);
if (IS_ERR(zone->file)) {
pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
zone_no, zlo->base_dir, zlo->id, zone_no,
PTR_ERR(zone->file));
return PTR_ERR(zone->file);
}
if (!zlo->block_size) {
ret = zloop_get_block_size(zlo, zone);
if (ret)
return ret;
}
ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
if (ret < 0) {
pr_err("Failed to get zone %u file stat\n", zone_no);
return ret;
}
file_sectors = stat.size >> SECTOR_SHIFT;
if (restore && file_sectors != zlo->zone_size) {
pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
zone_no, file_sectors, zlo->zone_capacity);
return ret;
}
ret = vfs_truncate(&zone->file->f_path,
zlo->zone_size << SECTOR_SHIFT);
if (ret < 0) {
pr_err("Failed to truncate zone %u file (err=%d)\n",
zone_no, ret);
return ret;
}
return 0;
}
/* Sequential zone file. */
zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
zlo->base_dir, zlo->id, zone_no);
if (IS_ERR(zone->file)) {
pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
zone_no, zlo->base_dir, zlo->id, zone_no,
PTR_ERR(zone->file));
return PTR_ERR(zone->file);
}
if (!zlo->block_size) {
ret = zloop_get_block_size(zlo, zone);
if (ret)
return ret;
}
zloop_get_block_size(zlo, zone);
mutex_lock(&zone->lock);
ret = zloop_update_seq_zone(zlo, zone_no);
mutex_unlock(&zone->lock);
return ret;
}
static bool zloop_dev_exists(struct zloop_device *zlo)
{
struct file *cnv, *seq;
bool exists;
cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
zlo->base_dir, zlo->id, 0);
seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
zlo->base_dir, zlo->id, 0);
exists = !IS_ERR(cnv) || !IS_ERR(seq);
if (!IS_ERR(cnv))
fput(cnv);
if (!IS_ERR(seq))
fput(seq);
return exists;
}
static int zloop_ctl_add(struct zloop_options *opts)
{
struct queue_limits lim = {
.max_hw_sectors = SZ_1M >> SECTOR_SHIFT,
.max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
.chunk_sectors = opts->zone_size,
.features = BLK_FEAT_ZONED,
};
unsigned int nr_zones, i, j;
struct zloop_device *zlo;
int ret = -EINVAL;
bool restore;
__module_get(THIS_MODULE);
nr_zones = opts->capacity >> ilog2(opts->zone_size);
if (opts->nr_conv_zones >= nr_zones) {
pr_err("Invalid number of conventional zones %u\n",
opts->nr_conv_zones);
goto out;
}
zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL);
if (!zlo) {
ret = -ENOMEM;
goto out;
}
zlo->state = Zlo_creating;
ret = mutex_lock_killable(&zloop_ctl_mutex);
if (ret)
goto out_free_dev;
/* Allocate id, if @opts->id >= 0, we're requesting that specific id */
if (opts->id >= 0) {
ret = idr_alloc(&zloop_index_idr, zlo,
opts->id, opts->id + 1, GFP_KERNEL);
if (ret == -ENOSPC)
ret = -EEXIST;
} else {
ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
}
mutex_unlock(&zloop_ctl_mutex);
if (ret < 0)
goto out_free_dev;
zlo->id = ret;
zlo->zone_shift = ilog2(opts->zone_size);
zlo->zone_size = opts->zone_size;
if (opts->zone_capacity)
zlo->zone_capacity = opts->zone_capacity;
else
zlo->zone_capacity = zlo->zone_size;
zlo->nr_zones = nr_zones;
zlo->nr_conv_zones = opts->nr_conv_zones;
zlo->buffered_io = opts->buffered_io;
zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
opts->nr_queues * opts->queue_depth, zlo->id);
if (!zlo->workqueue) {
ret = -ENOMEM;
goto out_free_idr;
}
if (opts->base_dir)
zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
else
zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
if (!zlo->base_dir) {
ret = -ENOMEM;
goto out_destroy_workqueue;
}
zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
zlo->base_dir, zlo->id);
if (IS_ERR(zlo->data_dir)) {
ret = PTR_ERR(zlo->data_dir);
pr_warn("Failed to open directory %s/%u (err=%d)\n",
zlo->base_dir, zlo->id, ret);
goto out_free_base_dir;
}
/*
* If we already have zone files, we are restoring a device created by a
* previous add operation. In this case, zloop_init_zone() will check
* that the zone files are consistent with the zone configuration given.
*/
restore = zloop_dev_exists(zlo);
for (i = 0; i < nr_zones; i++) {
ret = zloop_init_zone(zlo, opts, i, restore);
if (ret)
goto out_close_files;
}
lim.physical_block_size = zlo->block_size;
lim.logical_block_size = zlo->block_size;
zlo->tag_set.ops = &zloop_mq_ops;
zlo->tag_set.nr_hw_queues = opts->nr_queues;
zlo->tag_set.queue_depth = opts->queue_depth;
zlo->tag_set.numa_node = NUMA_NO_NODE;
zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
zlo->tag_set.driver_data = zlo;
ret = blk_mq_alloc_tag_set(&zlo->tag_set);
if (ret) {
pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
goto out_close_files;
}
zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
if (IS_ERR(zlo->disk)) {
pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
ret = PTR_ERR(zlo->disk);
goto out_cleanup_tags;
}
zlo->disk->flags = GENHD_FL_NO_PART;
zlo->disk->fops = &zloop_fops;
zlo->disk->private_data = zlo;
sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
ret = blk_revalidate_disk_zones(zlo->disk);
if (ret)
goto out_cleanup_disk;
ret = add_disk(zlo->disk);
if (ret) {
pr_err("add_disk failed (err=%d)\n", ret);
goto out_cleanup_disk;
}
mutex_lock(&zloop_ctl_mutex);
zlo->state = Zlo_live;
mutex_unlock(&zloop_ctl_mutex);
pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
zlo->id, zlo->nr_zones,
((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
zlo->block_size);
return 0;
out_cleanup_disk:
put_disk(zlo->disk);
out_cleanup_tags:
blk_mq_free_tag_set(&zlo->tag_set);
out_close_files:
for (j = 0; j < i; j++) {
struct zloop_zone *zone = &zlo->zones[j];
if (!IS_ERR_OR_NULL(zone->file))
fput(zone->file);
}
fput(zlo->data_dir);
out_free_base_dir:
kfree(zlo->base_dir);
out_destroy_workqueue:
destroy_workqueue(zlo->workqueue);
out_free_idr:
mutex_lock(&zloop_ctl_mutex);
idr_remove(&zloop_index_idr, zlo->id);
mutex_unlock(&zloop_ctl_mutex);
out_free_dev:
kvfree(zlo);
out:
module_put(THIS_MODULE);
if (ret == -ENOENT)
ret = -EINVAL;
return ret;
}
static int zloop_ctl_remove(struct zloop_options *opts)
{
struct zloop_device *zlo;
int ret;
if (!(opts->mask & ZLOOP_OPT_ID)) {
pr_err("No ID specified\n");
return -EINVAL;
}
ret = mutex_lock_killable(&zloop_ctl_mutex);
if (ret)
return ret;
zlo = idr_find(&zloop_index_idr, opts->id);
if (!zlo || zlo->state == Zlo_creating) {
ret = -ENODEV;
} else if (zlo->state == Zlo_deleting) {
ret = -EINVAL;
} else {
idr_remove(&zloop_index_idr, zlo->id);
zlo->state = Zlo_deleting;
}
mutex_unlock(&zloop_ctl_mutex);
if (ret)
return ret;
del_gendisk(zlo->disk);
put_disk(zlo->disk);
blk_mq_free_tag_set(&zlo->tag_set);
pr_info("Removed device %d\n", opts->id);
module_put(THIS_MODULE);
return 0;
}
static int zloop_parse_options(struct zloop_options *opts, const char *buf)
{
substring_t args[MAX_OPT_ARGS];
char *options, *o, *p;
unsigned int token;
int ret = 0;
/* Set defaults. */
opts->mask = 0;
opts->id = ZLOOP_DEF_ID;
opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
if (!buf)
return 0;
/* Skip leading spaces before the options. */
while (isspace(*buf))
buf++;
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
return -ENOMEM;
/* Parse the options, doing only some light invalid value checks. */
while ((p = strsep(&o, ",\n")) != NULL) {
if (!*p)
continue;
token = match_token(p, zloop_opt_tokens, args);
opts->mask |= token;
switch (token) {
case ZLOOP_OPT_ID:
if (match_int(args, &opts->id)) {
ret = -EINVAL;
goto out;
}
break;
case ZLOOP_OPT_CAPACITY:
if (match_uint(args, &token)) {
ret = -EINVAL;
goto out;
}
if (!token) {
pr_err("Invalid capacity\n");
ret = -EINVAL;
goto out;
}
opts->capacity =
((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
break;
case ZLOOP_OPT_ZONE_SIZE:
if (match_uint(args, &token)) {
ret = -EINVAL;
goto out;
}
if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB ||
!is_power_of_2(token)) {
pr_err("Invalid zone size %u\n", token);
ret = -EINVAL;
goto out;
}
opts->zone_size =
((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
break;
case ZLOOP_OPT_ZONE_CAPACITY:
if (match_uint(args, &token)) {
ret = -EINVAL;
goto out;
}
if (!token) {
pr_err("Invalid zone capacity\n");
ret = -EINVAL;
goto out;
}
opts->zone_capacity =
((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
break;
case ZLOOP_OPT_NR_CONV_ZONES:
if (match_uint(args, &token)) {
ret = -EINVAL;
goto out;
}
opts->nr_conv_zones = token;
break;
case ZLOOP_OPT_BASE_DIR:
p = match_strdup(args);
if (!p) {
ret = -ENOMEM;
goto out;
}
kfree(opts->base_dir);
opts->base_dir = p;
break;
case ZLOOP_OPT_NR_QUEUES:
if (match_uint(args, &token)) {
ret = -EINVAL;
goto out;
}
if (!token) {
pr_err("Invalid number of queues\n");
ret = -EINVAL;
goto out;
}
opts->nr_queues = min(token, num_online_cpus());
break;
case ZLOOP_OPT_QUEUE_DEPTH:
if (match_uint(args, &token)) {
ret = -EINVAL;
goto out;
}
if (!token) {
pr_err("Invalid queue depth\n");
ret = -EINVAL;
goto out;
}
opts->queue_depth = token;
break;
case ZLOOP_OPT_BUFFERED_IO:
opts->buffered_io = true;
break;
case ZLOOP_OPT_ERR:
default:
pr_warn("unknown parameter or missing value '%s'\n", p);
ret = -EINVAL;
goto out;
}
}
ret = -EINVAL;
if (opts->capacity <= opts->zone_size) {
pr_err("Invalid capacity\n");
goto out;
}
if (opts->zone_capacity > opts->zone_size) {
pr_err("Invalid zone capacity\n");
goto out;
}
ret = 0;
out:
kfree(options);
return ret;
}
enum {
ZLOOP_CTL_ADD,
ZLOOP_CTL_REMOVE,
};
static struct zloop_ctl_op {
int code;
const char *name;
} zloop_ctl_ops[] = {
{ ZLOOP_CTL_ADD, "add" },
{ ZLOOP_CTL_REMOVE, "remove" },
{ -1, NULL },
};
static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *pos)
{
struct zloop_options opts = { };
struct zloop_ctl_op *op;
const char *buf, *opts_buf;
int i, ret;
if (count > PAGE_SIZE)
return -ENOMEM;
buf = memdup_user_nul(ubuf, count);
if (IS_ERR(buf))
return PTR_ERR(buf);
for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) {
op = &zloop_ctl_ops[i];
if (!op->name) {
pr_err("Invalid operation\n");
ret = -EINVAL;
goto out;
}
if (!strncmp(buf, op->name, strlen(op->name)))
break;
}
if (count <= strlen(op->name))
opts_buf = NULL;
else
opts_buf = buf + strlen(op->name);
ret = zloop_parse_options(&opts, opts_buf);
if (ret) {
pr_err("Failed to parse options\n");
goto out;
}
switch (op->code) {
case ZLOOP_CTL_ADD:
ret = zloop_ctl_add(&opts);
break;
case ZLOOP_CTL_REMOVE:
ret = zloop_ctl_remove(&opts);
break;
default:
pr_err("Invalid operation\n");
ret = -EINVAL;
goto out;
}
out:
kfree(opts.base_dir);
kfree(buf);
return ret ? ret : count;
}
static int zloop_ctl_show(struct seq_file *seq_file, void *private)
{
const struct match_token *tok;
int i;
/* Add operation */
seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
tok = &zloop_opt_tokens[i];
if (!tok->pattern)
break;
if (i)
seq_putc(seq_file, ',');
seq_puts(seq_file, tok->pattern);
}
seq_putc(seq_file, '\n');
/* Remove operation */
seq_puts(seq_file, zloop_ctl_ops[1].name);
seq_puts(seq_file, " id=%d\n");
return 0;
}
static int zloop_ctl_open(struct inode *inode, struct file *file)
{
file->private_data = NULL;
return single_open(file, zloop_ctl_show, NULL);
}
static int zloop_ctl_release(struct inode *inode, struct file *file)
{
return single_release(inode, file);
}
static const struct file_operations zloop_ctl_fops = {
.owner = THIS_MODULE,
.open = zloop_ctl_open,
.release = zloop_ctl_release,
.write = zloop_ctl_write,
.read = seq_read,
};
static struct miscdevice zloop_misc = {
.minor = MISC_DYNAMIC_MINOR,
.name = "zloop-control",
.fops = &zloop_ctl_fops,
};
static int __init zloop_init(void)
{
int ret;
ret = misc_register(&zloop_misc);
if (ret) {
pr_err("Failed to register misc device: %d\n", ret);
return ret;
}
pr_info("Module loaded\n");
return 0;
}
static void __exit zloop_exit(void)
{
misc_deregister(&zloop_misc);
idr_destroy(&zloop_index_idr);
}
module_init(zloop_init);
module_exit(zloop_exit);
MODULE_DESCRIPTION("Zoned loopback device");
MODULE_LICENSE("GPL");