/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_BLOCK_MAP_H
#define VDO_BLOCK_MAP_H

#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "encodings.h"
#include "int-map.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"
/*
 * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
 * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
 * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
 * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
 *
 * Each logical zone has a single dedicated queue and thread for performing all updates to the
 * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
 * allow the code to omit more fine-grained locking for the block map structures.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as reading and
 * updating mappings, must be performed on the appropriate logical zone thread. Save operations
 * must be launched from the same admin thread as the original load operation.
 */
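
/*
 * For illustration only: one plausible way a logical block number could be
 * routed to its tree and zone, consistent with vdo_compute_logical_zone()
 * declared below. This is a sketch, not a definitive statement of the
 * implementation; the constant and field names are assumptions.
 *
 *	page_number_t page = lbn / VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
 *	block_count_t tree = page % map->root_count;
 *	zone_count_t zone = tree % map->zone_count;
 *
 * Because the tree index is derived deterministically from the logical
 * address, each zone thread can operate on its trees without locking.
 */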

enum {
	BLOCK_MAP_VIO_POOL_SIZE = 64,
};

/*
 * Generation counter for page references.
 */
typedef u32 vdo_page_generation;

extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;

/* The VDO Page Cache abstraction. */
struct vdo_page_cache {
	/* the VDO which owns this cache */
	struct vdo *vdo;
	/* number of pages in cache */
	page_count_t page_count;
	/* number of pages to write in the current batch */
	page_count_t pages_in_batch;
	/* Whether the VDO is doing a read-only rebuild */
	bool rebuilding;

	/* array of page information entries */
	struct page_info *infos;
	/* raw memory for pages */
	char *pages;
	/* cache last found page info */
	struct page_info *last_found;
	/* map of page number to info */
	struct int_map *page_map;
	/* main LRU list (all infos) */
	struct list_head lru_list;
	/* free page list (oldest first) */
	struct list_head free_list;
	/* outgoing page list */
	struct list_head outgoing_list;
	/* number of read I/O operations pending */
	page_count_t outstanding_reads;
	/* number of write I/O operations pending */
	page_count_t outstanding_writes;
	/* number of pages covered by the current flush */
	page_count_t pages_in_flush;
	/* number of pages waiting to be included in the next flush */
	page_count_t pages_to_flush;
	/* number of discards in progress */
	unsigned int discard_count;
	/* how many vdo_page_completions are waiting for a free page */
	unsigned int waiter_count;
	/* queue of waiters who want a free page */
	struct vdo_wait_queue free_waiters;
	/*
	 * Statistics are only updated on the logical zone thread, but are accessed from other
	 * threads.
	 */
	struct block_map_statistics stats;
	/* counter for pressure reports */
	u32 pressure_report;
	/* the block map zone to which this cache belongs */
	struct block_map_zone *zone;
};

/*
 * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
 * otherwise the page buffer is bound to a particular page whose absolute pbn is in the pbn field.
 * If the page is resident or dirty, the page data is stable and may be accessed. Otherwise the
 * page is in flight (incoming or outgoing) and its data should not be accessed.
 *
 * @note Update the static data in get_page_state_name() if you change this enumeration.
 */
enum vdo_page_buffer_state {
	/* this page buffer is not being used */
	PS_FREE,
	/* this page is being read from store */
	PS_INCOMING,
	/* attempt to load this page failed */
	PS_FAILED,
	/* this page is valid and un-modified */
	PS_RESIDENT,
	/* this page is valid and modified */
	PS_DIRTY,
	/* this page is being written and should not be used */
	PS_OUTGOING,
	/* not a state */
	PAGE_STATE_COUNT,
} __packed;
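
/*
 * Typical lifecycle of a page buffer (a sketch inferred from the states above,
 * not an exhaustive transition table): FREE -> INCOMING on a cache miss;
 * INCOMING -> RESIDENT on a successful read, or -> FAILED on an I/O error;
 * RESIDENT -> DIRTY when a writable page is modified; DIRTY -> OUTGOING while
 * a write is in progress; OUTGOING -> RESIDENT once the write completes.
 */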

/*
 * The write status of a page.
 */
enum vdo_page_write_status {
	WRITE_STATUS_NORMAL,
	WRITE_STATUS_DISCARD,
	WRITE_STATUS_DEFERRED,
} __packed;

/* Per-page-slot information. */
struct page_info {
	/* Preallocated page struct vio */
	struct vio *vio;
	/* back-link for references */
	struct vdo_page_cache *cache;
	/* the pbn of the page */
	physical_block_number_t pbn;
	/* page is busy (temporarily locked) */
	u16 busy;
	/* the write status of the page */
	enum vdo_page_write_status write_status;
	/* page state */
	enum vdo_page_buffer_state state;
	/* queue of completions awaiting this item */
	struct vdo_wait_queue waiting;
	/* state linked list entry */
	struct list_head state_entry;
	/* LRU entry */
	struct list_head lru_entry;
	/*
	 * The earliest recovery journal block containing uncommitted updates to the block map page
	 * associated with this page_info. A reference (lock) is held on that block to prevent it
	 * from being reaped. When this value changes, the reference on the old value must be
	 * released and a reference on the new value must be acquired.
	 */
	sequence_number_t recovery_lock;
};

/*
 * A completion awaiting a specific page. Also a live reference into the page once completed, until
 * freed.
 */
struct vdo_page_completion {
	/* The generic completion */
	struct vdo_completion completion;
	/* The cache involved */
	struct vdo_page_cache *cache;
	/* The waiter for the pending list */
	struct vdo_waiter waiter;
	/* The absolute physical block number of the page on disk */
	physical_block_number_t pbn;
	/* Whether the page may be modified */
	bool writable;
	/* Whether the page is available */
	bool ready;
	/* The info structure for the page, only valid when ready */
	struct page_info *info;
};

struct forest;

struct tree_page {
	struct vdo_waiter waiter;

	/* Dirty list entry */
	struct list_head entry;

	/* If dirty, the tree zone flush generation in which it was last dirtied. */
	u8 generation;

	/* Whether this page is an interior tree page being written out. */
	bool writing;

	/* If writing, the tree zone flush generation of the copy being written. */
	u8 writing_generation;

	/*
	 * Sequence number of the earliest recovery journal block containing uncommitted updates
	 * to this page.
	 */
	sequence_number_t recovery_lock;

	/* The value of recovery_lock when this page last started writing */
	sequence_number_t writing_recovery_lock;

	char page_buffer[VDO_BLOCK_SIZE];
};

enum block_map_page_type {
	VDO_TREE_PAGE,
	VDO_CACHE_PAGE,
};

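/*
 * A pair of dirty lists, presumably indexed by enum block_map_page_type (an
 * inference from the two-element size; consult the users of dirty_era_t for
 * the definitive indexing).
 */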
typedef struct list_head dirty_era_t[2];

struct dirty_lists {
	/* The number of periods after which an element will be expired */
	block_count_t maximum_age;
	/* The oldest period which has unexpired elements */
	sequence_number_t oldest_period;
	/* One more than the current period */
	sequence_number_t next_period;
	/* The offset in the array of lists of the oldest period */
	block_count_t offset;
	/* Expired pages */
	dirty_era_t expired;
	/* The lists of dirty pages */
	dirty_era_t eras[];
};

struct block_map_zone {
	zone_count_t zone_number;
	thread_id_t thread_id;
	struct admin_state state;
	struct block_map *block_map;
	/* Dirty pages, by era */
	struct dirty_lists *dirty_lists;
	struct vdo_page_cache page_cache;
	data_vio_count_t active_lookups;
	struct int_map *loading_pages;
	struct vio_pool *vio_pool;
	/* The tree page which has issued or will be issuing a flush */
	struct tree_page *flusher;
	struct vdo_wait_queue flush_waiters;
	/* The generation after the most recent flush */
	u8 generation;
	u8 oldest_generation;
	/* The counts of dirty pages in each generation */
	u32 dirty_page_counts[256];
};

struct block_map {
	struct vdo *vdo;
	struct action_manager *action_manager;
	/* The absolute PBN of the first root of the tree part of the block map */
	physical_block_number_t root_origin;
	block_count_t root_count;

	/* The era point we are currently distributing to the zones */
	sequence_number_t current_era_point;
	/* The next era point */
	sequence_number_t pending_era_point;

	/* The number of entries in the block map */
	block_count_t entry_count;
	nonce_t nonce;
	struct recovery_journal *journal;

	/* The trees for finding block map pages */
	struct forest *forest;
	/* The expanded trees awaiting growth */
	struct forest *next_forest;
	/* The number of entries after growth */
	block_count_t next_entry_count;

	zone_count_t zone_count;
	struct block_map_zone zones[];
};

/**
 * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
 *                                 the forest.
 * @pbn: A PBN of a tree node.
 * @completion: The parent completion of the traversal.
 *
 * Return: VDO_SUCCESS or an error.
 */
typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
				     struct vdo_completion *completion);
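
/*
 * Example (an illustrative sketch only): a callback suitable for passing to
 * vdo_traverse_forest() which validates each tree node PBN. The helper
 * is_valid_tree_pbn() is hypothetical and stands in for whatever check a
 * caller actually needs.
 *
 *	static int check_tree_pbn(physical_block_number_t pbn,
 *				  struct vdo_completion *completion)
 *	{
 *		if (!is_valid_tree_pbn(completion->vdo, pbn))
 *			return VDO_BAD_CONFIGURATION;
 *		return VDO_SUCCESS;
 *	}
 */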

static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
	return container_of(completion, struct vdo_page_completion, completion);
}

void vdo_release_page_completion(struct vdo_completion *completion);

void vdo_get_page(struct vdo_page_completion *page_completion,
		  struct block_map_zone *zone, physical_block_number_t pbn,
		  bool writable, void *parent, vdo_action_fn callback,
		  vdo_action_fn error_handler, bool requeue);
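
/*
 * Typical usage (a sketch under stated assumptions, not a definitive recipe):
 * a caller embeds a vdo_page_completion, requests the page, and in the
 * callback retrieves the cached page before releasing the reference. The
 * field and helper names here are illustrative.
 *
 *	vdo_get_page(&lock->page_completion, zone, pbn, true, data_vio,
 *		     handle_page, handle_error, false);
 *
 *	static void handle_page(struct vdo_completion *completion)
 *	{
 *		struct block_map_page *page;
 *		int result = vdo_get_cached_page(completion, &page);
 *
 *		if (result == VDO_SUCCESS)
 *			examine_or_update(page); // hypothetical helper
 *		vdo_release_page_completion(completion);
 *	}
 */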

void vdo_request_page_write(struct vdo_completion *completion);

int __must_check vdo_get_cached_page(struct vdo_completion *completion,
				     struct block_map_page **page_ptr);

int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);

static inline struct block_map_page * __must_check
vdo_as_block_map_page(struct tree_page *tree_page)
{
	return (struct block_map_page *) tree_page->page_buffer;
}

bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
			 physical_block_number_t pbn,
			 struct block_map_page *page);

void vdo_find_block_map_slot(struct data_vio *data_vio);

physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
						    page_number_t page_number);

void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);

void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
			 struct vdo_completion *completion);

int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
				      block_count_t logical_blocks, struct vdo *vdo,
				      struct recovery_journal *journal, nonce_t nonce,
				      page_count_t cache_size, block_count_t maximum_age,
				      struct block_map **map_ptr);

void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
			 struct vdo_completion *parent);

void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);

int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
					       block_count_t new_logical_blocks);

void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);

void vdo_abandon_block_map_growth(struct block_map *map);

void vdo_free_block_map(struct block_map *map);

struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);

void vdo_initialize_block_map_from_journal(struct block_map *map,
					   struct recovery_journal *journal);

zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);

void vdo_advance_block_map_era(struct block_map *map,
			       sequence_number_t recovery_block_number);

void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
			       physical_block_number_t pbn,
			       enum block_mapping_state mapping_state,
			       sequence_number_t *recovery_lock);

void vdo_get_mapped_block(struct data_vio *data_vio);

void vdo_put_mapped_block(struct data_vio *data_vio);

struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);

/**
 * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format.
 * @age: The configured maximum age.
 *
 * Return: The converted age.
 *
 * In the old recovery journal format, each journal block held 311 entries, and every write bio
 * made two entries. The old maximum age was half the usable journal length. In the new format,
 * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
 * age so that the number of writes in a block map era is the same in the old and new formats. This
 * keeps the bound on the amount of work required to recover the block map from the recovery
 * journal the same across the format change. It also keeps the amortization of block map page
 * writes to write bios the same.
 */
static inline block_count_t vdo_convert_maximum_age(block_count_t age)
{
	return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
			    2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
}
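
/*
 * Worked example (using the entry counts named in the comment above, i.e.
 * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK == 311 and
 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK == 217): a configured age of 60
 * old-format blocks covers 60 * 311 / 2 = 9330 writes; at one entry per
 * write, those writes now span DIV_ROUND_UP(60 * 311, 2 * 217) =
 * DIV_ROUND_UP(18660, 434) = 43 new-format blocks.
 */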

#endif /* VDO_BLOCK_MAP_H */