// SPDX-License-Identifier: GPL-2.0-or-later
/* RACK-TLP [RFC8985] Implementation
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
|  |  | 
|  | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 
|  |  | 
|  | #include "ar-internal.h" | 
|  |  | 
|  | static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1, | 
|  | ktime_t t2, rxrpc_seq_t seq2) | 
|  | { | 
|  | if (ktime_after(t1, t2)) | 
|  | return true; | 
|  | return t1 == t2 && after(seq1, seq2); | 
|  | } | 
|  |  | 
/*
 * Mark a packet lost, updating the loss/resend accounting on the call.
 */
static void rxrpc_rack_mark_lost(struct rxrpc_call *call,
				 struct rxrpc_txqueue *tq, unsigned int ix)
{
	if (__test_and_set_bit(ix, &tq->segment_lost)) {
		/* Already marked lost.  If it had been retransmitted in the
		 * meantime, that retransmission is now presumed lost as well,
		 * so it no longer counts towards the resent total.
		 */
		if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
			call->tx_nr_resent--;
	} else {
		call->tx_nr_lost++;
	}
	/* Invalidate the per-segment transmission timestamp:
	 * rxrpc_get_xmit_ts() maps UINT_MAX to KTIME_MAX until a
	 * retransmission restamps the segment.
	 */
	tq->segment_xmit_ts[ix] = UINT_MAX;
}
|  |  | 
|  | /* | 
|  | * Get the transmission time of a packet in the Tx queue. | 
|  | */ | 
|  | static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix) | 
|  | { | 
|  | if (tq->segment_xmit_ts[ix] == UINT_MAX) | 
|  | return KTIME_MAX; | 
|  | return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Get a bitmask of nack bits for a queue segment and mask off any that aren't | 
|  | * yet reported. | 
|  | */ | 
|  | static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq) | 
|  | { | 
|  | unsigned long nacks = ~tq->segment_acked; | 
|  |  | 
|  | if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE) | 
|  | nacks &= (1UL << tq->nr_reported_acks) - 1; | 
|  | return nacks; | 
|  | } | 
|  |  | 
/*
 * Update the RACK state for the most recently sent packet that has been
 * delivered [RFC8985 6.2 Step 2].
 *
 * Clears any loss marking on the newly delivered segment, discards possibly
 * spurious ACKs of retransmissions, and advances RACK.rtt and the RACK
 * reference point (rack_xmit_ts/rack_end_seq) used by loss detection.
 */
static void rxrpc_rack_update(struct rxrpc_call *call,
			      struct rxrpc_ack_summary *summary,
			      struct rxrpc_txqueue *tq,
			      unsigned int ix)
{
	rxrpc_seq_t seq = tq->qbase + ix;
	ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
	ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts);

	/* The segment has now been delivered, so it is no longer lost. */
	if (__test_and_clear_bit(ix, &tq->segment_lost))
		call->tx_nr_lost--;

	if (test_bit(ix, &tq->segment_retransmitted)) {
		/* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply.  If
		 * the highest serial ACK'd doesn't cover the retransmission's
		 * serial, the ACK may be for the original transmission -
		 * ignore it.
		 */
		if (before(call->acks_highest_serial, tq->segment_serial[ix]))
			return;
		/* An RTT below the observed minimum likewise suggests the ACK
		 * was for the original transmission, not the retransmission.
		 */
		if (rtt < minmax_get(&call->min_rtt))
			return;
	}

	/* The RACK algorithm requires the segment ACKs to be traversed in
	 * order of segment transmission - but the only thing this seems to
	 * matter for is that RACK.rtt is set to the rtt of the most recently
	 * transmitted segment.  We should be able to achieve the same by only
	 * setting RACK.rtt if the xmit time is later.
	 */
	if (ktime_after(xmit_ts, call->rack_rtt_ts)) {
		call->rack_rtt	  = rtt;
		call->rack_rtt_ts = xmit_ts;
	}

	/* Advance the reference point if this segment was sent after the
	 * current one [RFC8985 "RACK_sent_after()"].
	 */
	if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) {
		call->rack_rtt = rtt;
		call->rack_xmit_ts = xmit_ts;
		call->rack_end_seq = seq;
	}
}
|  |  | 
|  | /* | 
|  | * Detect data segment reordering [RFC8958 6.2 Step 3]. | 
|  | */ | 
|  | static void rxrpc_rack_detect_reordering(struct rxrpc_call *call, | 
|  | struct rxrpc_ack_summary *summary, | 
|  | struct rxrpc_txqueue *tq, | 
|  | unsigned int ix) | 
|  | { | 
|  | rxrpc_seq_t seq = tq->qbase + ix; | 
|  |  | 
|  | /* Track the highest sequence number so far ACK'd.  This is not | 
|  | * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer | 
|  | * could put a NACK in the last SACK slot. | 
|  | */ | 
|  | if (after(seq, call->rack_fack)) | 
|  | call->rack_fack = seq; | 
|  | else if (before(seq, call->rack_fack) && | 
|  | test_bit(ix, &tq->segment_retransmitted)) | 
|  | call->rack_reordering_seen = true; | 
|  | } | 
|  |  | 
/*
 * Process one newly-ACK'd segment through RACK: update the RACK state
 * [RFC8985 6.2 Step 2], then check for reordering [Step 3].
 */
void rxrpc_input_rack_one(struct rxrpc_call *call,
			  struct rxrpc_ack_summary *summary,
			  struct rxrpc_txqueue *tq,
			  unsigned int ix)
{
	rxrpc_rack_update(call, summary, tq, ix);
	rxrpc_rack_detect_reordering(call, summary, tq, ix);
}
|  |  | 
/*
 * Feed a batch of newly-ACK'd segments (a bitmask relative to tq->qbase)
 * through the per-segment RACK processing, lowest sequence first, then emit
 * a tracepoint with the updated RACK state.
 */
void rxrpc_input_rack(struct rxrpc_call *call,
		      struct rxrpc_ack_summary *summary,
		      struct rxrpc_txqueue *tq,
		      unsigned long new_acks)
{
	/* x &= x - 1 strips the lowest set bit each iteration. */
	for (; new_acks; new_acks &= new_acks - 1)
		rxrpc_input_rack_one(call, summary, tq, __ffs(new_acks));

	trace_rxrpc_rack_update(call, summary);
}
|  |  | 
/*
 * Update the reordering window [RFC8985 6.2 Step 4].  Returns the updated
 * duration of the reordering window.
 *
 * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs
 * can be given a 'DUPLICATE' reason with the serial number referring to the
 * duplicated DATA packet.  Rx does not inform as to whether this was a
 * reception of the same packet twice or of a retransmission of a packet we
 * already received (though this could be determined by the transmitter based
 * on the serial number).
 */
static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call,
					 struct rxrpc_ack_summary *summary)
{
	rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
	rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */
	bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE;
	int dup_thresh = 3;

	/* DSACK-based reordering window adaptation: the current DSACK round
	 * ends once the lowest unack'd seq passes the round marker.
	 */
	if (!call->rack_dsack_round_none &&
	    after_eq(snd_una, call->rack_dsack_round))
		call->rack_dsack_round_none = true;

	/* Grow the reordering window per round that sees DSACK.  Reset the
	 * window after 16 DSACK-free recoveries.
	 */
	if (call->rack_dsack_round_none && have_dsack_option) {
		call->rack_dsack_round_none = false;
		call->rack_dsack_round = snd_nxt;
		call->rack_reo_wnd_mult++;
		call->rack_reo_wnd_persist = 16;
	} else if (summary->exiting_fast_or_rto_recovery) {
		call->rack_reo_wnd_persist--;
		if (call->rack_reo_wnd_persist <= 0)
			call->rack_reo_wnd_mult = 1;
	}

	/* If no reordering has ever been observed, be aggressive: use a zero
	 * window whilst in recovery or once dup_thresh SACKs have accrued.
	 */
	if (!call->rack_reordering_seen) {
		if (summary->in_fast_or_rto_recovery)
			return 0;
		if (call->acks_nr_sacks >= dup_thresh)
			return 0;
	}

	/* Window = mult * min_RTT / 4, capped at the smoothed RTT.  (srtt_us
	 * is presumably stored scaled by 8, hence the >> 3 - TODO confirm
	 * against where srtt_us is maintained.)
	 */
	return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4,
				call->srtt_us >> 3));
}
|  |  | 
/*
 * Detect losses [RFC8985 6.2 Step 5].
 *
 * Scan all NACK'd segments that were sent before the RACK reference point
 * and mark as lost those whose xmit time + RACK.rtt + reo_wnd has already
 * elapsed.  Returns the longest remaining wait amongst the not-yet-expired
 * candidates (so a rearmed timer covers all of them), or 0 if none remain.
 */
static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call,
				      struct rxrpc_ack_summary *summary)
{
	struct rxrpc_txqueue *tq;
	ktime_t timeout = 0, lost_after, now = ktime_get_real();

	call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary);
	lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
	trace_rxrpc_rack_scan_loss(call);

	for (tq = call->tx_queue; tq; tq = tq->next) {
		unsigned long nacks = rxrpc_tq_nacks(tq);

		/* Queues are in sequence order, so stop at the first one
		 * starting beyond what has actually been transmitted.
		 */
		if (after(tq->qbase, call->tx_transmitted))
			break;
		trace_rxrpc_rack_scan_loss_tq(call, tq, nacks);

		/* Skip ones marked lost but not yet retransmitted */
		nacks &= ~tq->segment_lost | tq->segment_retransmitted;

		while (nacks) {
			unsigned int ix = __ffs(nacks);
			rxrpc_seq_t seq = tq->qbase + ix;
			ktime_t remaining;
			ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);

			__clear_bit(ix, &nacks);

			/* Only segments sent before the RACK reference point
			 * are candidates for being declared lost.
			 */
			if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq,
						  xmit_ts, seq)) {
				remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now);
				if (remaining <= 0) {
					rxrpc_rack_mark_lost(call, tq, ix);
					trace_rxrpc_rack_detect_loss(call, summary, seq);
				} else {
					timeout = max(remaining, timeout);
				}
			}
		}
	}

	return timeout;
}
|  |  | 
|  | /* | 
|  | * Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5]. | 
|  | */ | 
|  | void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, | 
|  | struct rxrpc_ack_summary *summary) | 
|  | { | 
|  | ktime_t timeout = rxrpc_rack_detect_loss(call, summary); | 
|  |  | 
|  | if (timeout) { | 
|  | call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER; | 
|  | call->rack_timo_at = ktime_add(ktime_get_real(), timeout); | 
|  | trace_rxrpc_rack_timer(call, timeout, false); | 
|  | trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo); | 
|  | } | 
|  | } | 
|  |  | 
/*
 * Handle RACK-TLP RTO expiration [RFC8985 6.3]: mark as lost every unack'd
 * transmitted segment that is either the lowest outstanding sequence or was
 * sent longer than RACK.rtt + RACK.reo_wnd ago.
 */
static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call)
{
	struct rxrpc_txqueue *tq;
	rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
	ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
	ktime_t deadline = ktime_sub(ktime_get_real(), lost_after);

	for (tq = call->tx_queue; tq; tq = tq->next) {
		unsigned long unacked = ~tq->segment_acked;

		trace_rxrpc_rack_mark_loss_tq(call, tq);
		while (unacked) {
			unsigned int ix = __ffs(unacked);
			rxrpc_seq_t seq = tq->qbase + ix;
			ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);

			/* Queues are in sequence order, so once we pass the
			 * last transmitted seq there is nothing further to
			 * examine in this or any later queue - return, not
			 * break.
			 */
			if (after(seq, call->tx_transmitted))
				return;
			__clear_bit(ix, &unacked);

			if (seq == snd_una ||
			    ktime_before(xmit_ts, deadline))
				rxrpc_rack_mark_lost(call, tq, ix);
		}
	}
}
|  |  | 
/*
 * Calculate the TLP loss probe timeout (PTO) [RFC8985 7.2].
 *
 * Returns the delay before a loss probe should fire, clamped so that the
 * probe never fires later than the (backed-off) RTO would.
 */
ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now)
{
	unsigned int flight_size = rxrpc_tx_in_flight(call);
	ktime_t rto_at = ktime_add(call->tx_last_sent,
				   rxrpc_get_rto_backoff(call, false));
	ktime_t pto;

	if (call->rtt_count > 0) {
		/* Use 2*SRTT as the timeout.  (srtt_us appears to hold the
		 * SRTT scaled by 8 - cf. the >> 3 in
		 * rxrpc_rack_update_reo_wnd() - so /4 yields 2*SRTT in ns;
		 * TODO confirm the scaling.)
		 */
		pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4);
		if (flight_size)
			pto = ktime_add(pto, call->tlp_max_ack_delay);
	} else {
		/* No RTT sample yet; fall back to a 1s probe timeout. */
		pto = NSEC_PER_SEC;
	}

	/* Don't let the loss probe timer outlive the RTO timer. */
	if (ktime_after(ktime_add(now, pto), rto_at))
		pto = ktime_sub(rto_at, now);
	return pto;
}
|  |  | 
/*
 * Send a TLP loss probe on PTO expiration [RFC8985 7.3].
 *
 * If no probe is still outstanding and a fresh RTT sample has been taken
 * since the last one, either transmit one new DATA packet (if unsent data is
 * queued and the window allows) or retransmit the highest-sequence DATA
 * already sent.  Finally, rearm the RACK timer in RTO mode whilst anything
 * remains in flight.
 */
void rxrpc_tlp_send_probe(struct rxrpc_call *call)
{
	unsigned int in_flight = rxrpc_tx_in_flight(call);

	if (after_eq(call->acks_hard_ack, call->tx_transmitted))
		return; /* Everything we transmitted has been acked. */

	/* There must be no other loss probe still in flight and we need to
	 * have taken a new RTT sample since last probe or the start of
	 * connection.
	 */
	if (!call->tlp_serial &&
	    call->tlp_rtt_taken != call->rtt_taken) {
		call->tlp_is_retrans = false;
		if (after(call->send_top, call->tx_transmitted) &&
		    rxrpc_tx_window_space(call) > 0) {
			/* Transmit the lowest-sequence unsent DATA.
			 * tx_last_serial is zeroed first so the serial of the
			 * probe just transmitted can be captured afterwards.
			 */
			call->tx_last_serial = 0;
			rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data);
			call->tlp_serial = call->tx_last_serial;
			call->tlp_seq = call->tx_transmitted;
			trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new);
			/* The new transmission counts towards in-flight. */
			in_flight = rxrpc_tx_in_flight(call);
		} else {
			/* Retransmit the highest-sequence DATA sent */
			call->tx_last_serial = 0;
			rxrpc_resend_tlp(call);
			call->tlp_is_retrans = true;
			trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit);
		}
	} else {
		trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy);
	}

	/* Rearm as an RTO-mode timer whilst data remains outstanding. */
	if (in_flight != 0) {
		ktime_t rto = rxrpc_get_rto_backoff(call, false);

		call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO;
		call->rack_timo_at = ktime_add(ktime_get_real(), rto);
		trace_rxrpc_rack_timer(call, rto, false);
		trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto);
	}
}
|  |  | 
/*
 * Detect losses using the ACK of a TLP loss probe [RFC8985 7.4].
 *
 * Only acts once the hard-ack point has reached the probe's sequence;
 * tlp_serial is cleared whenever the probe episode is resolved, allowing a
 * new probe to be scheduled.
 */
void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
{
	/* No probe outstanding, or the probe seq isn't hard-acked yet. */
	if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack))
		return;

	if (!call->tlp_is_retrans) {
		/* TLP of new data delivered */
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
		call->tlp_serial = 0;
	} else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
		   summary->acked_serial == call->tlp_serial) {
		/* General Case: Detected packet losses using RACK [7.4.1] */
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
		call->tlp_serial = 0;
	} else if (after(call->acks_hard_ack, call->tlp_seq)) {
		/* Repaired the single loss */
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
		call->tlp_serial = 0;
		// TODO: Invoke congestion control to react to the loss
		// event the probe has repaired
	} else if (summary->tlp_probe_acked) {
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
		/* Special Case: Detected a single loss repaired by the loss
		 * probe [7.4.2]
		 */
		call->tlp_serial = 0;
	} else {
		/* Probe not yet accounted for; leave the episode open. */
		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
	}
}
|  |  | 
|  | /* | 
|  | * Handle RACK timer expiration; returns true to request a resend. | 
|  | */ | 
|  | void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by) | 
|  | { | 
|  | struct rxrpc_ack_summary summary = {}; | 
|  | enum rxrpc_rack_timer_mode mode = call->rack_timer_mode; | 
|  |  | 
|  | trace_rxrpc_rack_timer(call, overran_by, true); | 
|  | call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; | 
|  |  | 
|  | switch (mode) { | 
|  | case RXRPC_CALL_RACKTIMER_RACK_REORDER: | 
|  | rxrpc_rack_detect_loss_and_arm_timer(call, &summary); | 
|  | break; | 
|  | case RXRPC_CALL_RACKTIMER_TLP_PTO: | 
|  | rxrpc_tlp_send_probe(call); | 
|  | break; | 
|  | case RXRPC_CALL_RACKTIMER_RTO: | 
|  | // Might need to poke the congestion algo in some way | 
|  | rxrpc_rack_mark_losses_on_rto(call); | 
|  | break; | 
|  | //case RXRPC_CALL_RACKTIMER_ZEROWIN: | 
|  | default: | 
|  | pr_warn("Unexpected rack timer %u", call->rack_timer_mode); | 
|  | } | 
|  | } |