// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

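/*
 * Decide whether to avoid reading from a target because its devices are
 * congested: per-device congestion counts decay with time since they were
 * last bumped, and we return true with probability
 * total_congestion / (nr_devices * CONGESTED_MAX).
 */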
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

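/*
 * An in-flight promotion (or error-recovery rewrite): tracked in
 * c->promote_table, keyed by pos, so a duplicate promote of the same extent
 * is skipped while one is already in flight.
 */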
struct promote_op {
	struct rcu_head		rcu;
	u64			start_time;

	struct rhash_head	hash;
	struct bpos		pos;

	struct work_struct	work;
	struct data_update	write;
	struct bio_vec		bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	bch2_data_update_exit(&op->write);

	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

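/*
 * Allocate and initialize the data update backing a promote: normally we
 * write a cached copy to the promote target; if we're recovering from an IO
 * error (have_io_error()) we instead rewrite the pointers that failed.
 */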
static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	op->write.type = BCH_DATA_UPDATE_promote;
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_hash;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce	= true;
	op->write.rbio.promote	= true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					  struct bvec_iter iter,
					  struct bkey_s_c k,
					  struct extent_ptr_decoded *pick,
					  unsigned flags,
					  struct bch_read_bio *orig,
					  bool *bounce,
					  bool *read_full,
					  struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

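/*
 * Contexts are ordered: bch2_rbio_punt() only punts work to a workqueue when
 * the requested context is heavier than the one we're already running in,
 * otherwise it runs the function directly.
 */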
enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		percpu_ref_put(&ca->io_ref[READ]);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

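/*
 * Retry a read done on behalf of a data update: re-look up the extent, and
 * bail out with -BCH_ERR_data_read_key_overwritten if it's no longer the key
 * we were reading.
 */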
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = -BCH_ERR_data_read_key_overwritten;
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

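/*
 * Top level retry worker: for errors where we should avoid the device that
 * failed, record the failure so a different replica is picked, then clear
 * error state and reissue the read with BCH_READ_in_retry set.
 */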
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };
	struct btree_trans *trans = bch2_trans_get(c);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status	= 0;
		rbio->ret		= 0;
	}

	unsigned subvol		= rbio->subvol;
	struct bpos read_pos	= rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	} else {
		struct printbuf buf = PRINTBUF;

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");
		prt_str(&buf, "successful retry");

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret		= ret;
	rbio->bio.bi_status	= blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret		= ret;
		rbio->bio.bi_status	= blk_error;

		bch2_rbio_done(rbio);
	}
}

static void bch2_read_io_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bio *bio = &rbio->bio;
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));

	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	printbuf_exit(&buf);
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}

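/*
 * Narrow checksums: after reading and verifying the checksum of a full
 * extent, recompute a checksum covering just the live portion of the key and
 * update the extent with the narrower crc, so future reads of this extent
 * don't have to read and checksum the extra sectors.
 */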
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_csum_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bio *src		= &rbio->bio;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
	struct bio *src			= &rbio->bio;
	struct bio *dst			= &parent->bio;
	struct bvec_iter dst_iter	= rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset     += rbio->offset_into_extent;
		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

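/*
 * A non-cached pointer whose bucket gen doesn't match is a filesystem
 * inconsistency; log as much detail about the extent and the alloc key as we
 * can before flagging it.
 */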
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

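/*
 * Read a single extent (or fragment of one): pick a replica to read from,
 * decide whether we need to bounce and/or read the full extent (compression,
 * checksums, promotes), then submit the IO - or kick off erasure coding
 * reconstruction if the pick requires it.
 */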
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n  ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = -BCH_ERR_data_read_no_encryption_key;
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		percpu_ref_put(&ca->io_ref[READ]);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				percpu_ref_put(&ca->io_ref[READ]);
			rbio->ret = -BCH_ERR_data_read_buffer_too_small;
			goto out_read_done;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		read_full = true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						DIV_ROUND_UP(sectors, PAGE_SECTORS),
						0,
						GFP_NOFS,
						&c->bio_read_split),
					  orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						&c->bio_read_split),
					  orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time	= local_clock();
	if (!rbio->split)
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->ret		= 0;
	rbio->context		= 0;
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			struct printbuf buf = PRINTBUF;
			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
			prt_printf(&buf, "no device to read from:\n  ");
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);

			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					     ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status	= BLK_STS_IOERR;
	orig->ret		= ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = -BCH_ERR_data_read_key_overwritten;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

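/*
 * Main read path: walk the extents covering the request (following reflink
 * indirection) and issue __bch2_read_extent() for each fragment.
 */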
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		bch2_btree_iter_set_pos(trans, &iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		swap(bvec_iter.bi_size, bytes);

		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);

	if (ret) {
		struct printbuf buf = PRINTBUF;
		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf, inum,
						       bvec_iter.bi_sector << 9));
		prt_printf(&buf, "read error: %s", bch2_err_str(ret));
		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);

		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}