| #!/usr/bin/env python3 |
| # SPDX-License-Identifier: GPL-2.0 |
| |
| """ |
| Devlink Rate TC Bandwidth Test Suite |
| =================================== |
| |
| This test suite verifies the functionality of devlink-rate traffic class (TC) |
| bandwidth distribution in a virtualized environment. The tests validate that |
| bandwidth can be properly allocated between different traffic classes and |
| that TC mapping works as expected. |
| |
| Test Environment: |
| ---------------- |
| - Creates 1 VF |
| - Establishes a bridge connecting the VF representor and the uplink representor |
| - Sets up 2 VLAN interfaces on the VF with different VLAN IDs (101, 102) |
| - Configures different traffic classes (TC3 and TC4) for each VLAN |
| |
| Test Cases: |
| ---------- |
| 1. test_no_tc_mapping_bandwidth: |
| - Verifies that without TC mapping, bandwidth is NOT distributed according to |
| the configured 80/20 split between TC4 and TC3 |
| - This test should fail if bandwidth matches the 80/20 split without TC |
| mapping |
| - Expected: Bandwidth should NOT be distributed as 80/20 |
| |
| 2. test_tc_mapping_bandwidth: |
| - Configures TC mapping using mqprio qdisc |
| - Verifies that with TC mapping, bandwidth IS distributed according to the |
| configured 80/20 split between TC4 and TC3 |
| - Expected: Bandwidth should be distributed as 80/20 |
| |
| Bandwidth Distribution: |
| ---------------------- |
| - TC3 (VLAN 101): Configured for 20% of total bandwidth |
| - TC4 (VLAN 102): Configured for 80% of total bandwidth |
| - Total bandwidth: 1Gbps |
| - Tolerance: +-12% |
| |
| Hardware-Specific Behavior (mlx5): |
| -------------------------- |
| mlx5 hardware enforces traffic class separation by ensuring that each transmit |
| queue (SQ) is associated with a single TC. If a packet is sent on a queue that |
| doesn't match the expected TC (based on DSCP or VLAN priority and hypervisor-set |
| mapping), the hardware moves the queue to the correct TC scheduler to preserve |
| traffic isolation. |
| |
| This behavior means that even without explicit TC-to-queue mapping, bandwidth |
| enforcement may still appear to work—because the hardware dynamically adjusts |
| the scheduling context. However, this can lead to performance issues at high |
| rates and to head-of-line (HOL) blocking if traffic from different TCs is |
| mixed on the same queue. |
| """ |
| |
| import json |
| import os |
| import subprocess |
| import threading |
| import time |
| |
| from lib.py import ksft_pr, ksft_run, ksft_exit |
| from lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx |
| from lib.py import NetDrvEpEnv, DevlinkFamily |
| from lib.py import NlError |
| from lib.py import cmd, defer, ethtool, ip |
| |
| |
class BandwidthValidator:
    """
    Checks measured bandwidth figures against their expected values.

    Holds the expected total rate (in Gbps), the expected percentage share
    of each traffic class, and a relative tolerance that is applied to
    whichever expected value is being checked.
    """

    def __init__(self):
        self.tolerance_percent = 12
        self.expected_total_gbps = 1.0
        # Pre-computed acceptance window for the total rate.
        self.total_min_expected = self.min_expected(self.expected_total_gbps)
        self.total_max_expected = self.max_expected(self.expected_total_gbps)
        # Expected share of the total (in percent), keyed by TC index.
        self.tc_expected_percent = {3: 20.0, 4: 80.0}

    def _margin(self, value):
        """Returns the tolerance margin (tolerance_percent of value)."""
        return value * self.tolerance_percent / 100

    def min_expected(self, value):
        """Returns the lowest value still accepted for the given target."""
        return value - self._margin(value)

    def max_expected(self, value):
        """Returns the highest value still accepted for the given target."""
        return value + self._margin(value)

    def bound(self, expected, value):
        """Returns True if value lies inside the window around expected."""
        lower = self.min_expected(expected)
        upper = self.max_expected(expected)
        return lower <= value <= upper

    def tc_bandwidth_bound(self, value, tc_ix):
        """
        Returns True if value (a percentage) lies inside the tolerance
        window around the share expected for traffic class tc_ix.
        """
        return self.bound(self.tc_expected_percent[tc_ix], value)
| |
| |
def setup_vf(cfg, set_tc_mapping=True):
    """
    Sets up a VF on the given network interface.

    Switches the eswitch to switchdev mode, enables one VF through sysfs,
    brings the VF netdev up and, when set_tc_mapping is True, installs an
    mqprio root qdisc (DCB mode, hw offload, 8 TCs) on it. Reverting the
    eswitch mode and disabling SR-IOV are registered with defer().

    Returns:
        The name of the VF network interface.

    Raises:
        KsftSkipEx: if switchdev mode or SR-IOV cannot be enabled, or if
            no VF netdev can be found under virtfn0.
    """
    try:
        cmd(f"devlink dev eswitch set pci/{cfg.pci} mode switchdev")
        defer(cmd, f"devlink dev eswitch set pci/{cfg.pci} mode legacy")
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable switchdev mode on {cfg.pci}") from exc
    try:
        cmd(f"echo 1 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs")
        defer(cmd, f"echo 0 > /sys/class/net/{cfg.ifname}/device/sriov_numvfs")
    except Exception as exc:
        raise KsftSkipEx(f"Failed to enable SR-IOV on {cfg.ifname}") from exc

    # Presumably gives the newly created VF time to register its netdev.
    time.sleep(2)

    vf_net_dir = f"/sys/class/net/{cfg.ifname}/device/virtfn0/net"
    try:
        vf_netdevs = os.listdir(vf_net_dir)
    except OSError as exc:
        # A missing directory means the same thing as an empty one here:
        # there is no VF netdev to test with, so skip rather than error.
        raise KsftSkipEx("VF interface not found") from exc
    if not vf_netdevs:
        raise KsftSkipEx("VF interface not found")

    vf_ifc = vf_netdevs[0]
    ip(f"link set dev {vf_ifc} up")
    if set_tc_mapping:
        cmd(f"tc qdisc add dev {vf_ifc} root handle 5 mqprio mode dcb hw 1 num_tc 8")

    return vf_ifc
| |
| |
def setup_vlans_on_vf(vf_ifc):
    """
    Creates the two test VLAN devices on top of the VF.

    Each VLAN device gets an address from a /29 subnet, is brought up, and
    has egress priority 0 mapped to its own traffic class through the VLAN
    egress-qos-map.
    """
    # (VLAN ID, traffic class, local address)
    vlan_table = (
        (101, 3, "198.51.100.2"),
        (102, 4, "198.51.100.10"),
    )

    for vlan_id, tc_ix, addr in vlan_table:
        vlan_dev = f"{vf_ifc}.{vlan_id}"
        ip(f"link add link {vf_ifc} name {vlan_dev} type vlan id {vlan_id}")
        ip(f"addr add {addr}/29 dev {vlan_dev}")
        ip(f"link set dev {vlan_dev} up")
        ip(f"link set dev {vlan_dev} type vlan egress-qos-map 0:{tc_ix}")
        ksft_pr(f"Created VLAN {vlan_dev} on {vf_ifc} with tc {tc_ix} and IP {addr}")
| |
| |
def get_vf_info(cfg):
    """
    Looks up the representor netdev and devlink port index of VF 0.

    Parses the JSON output of "devlink port show" and picks the first port
    under cfg.pci whose "vfnum" is 0. The results are stored on cfg as
    vf_representor and vf_port_index; both stay None when no port matches.
    """
    cfg.vf_representor = None
    cfg.vf_port_index = None

    raw = subprocess.check_output(["devlink", "-j", "port", "show"], encoding="utf-8")
    prefix = f"pci/{cfg.pci}/"

    for handle, attrs in json.loads(raw)["port"].items():
        if not handle.startswith(prefix):
            continue
        if attrs.get("vfnum") != 0:
            continue
        cfg.vf_representor = attrs.get("netdev")
        # The port handle has the form pci/<address>/<index>.
        cfg.vf_port_index = int(handle.split("/")[-1])
        break
| |
| |
def setup_bridge(cfg):
    """
    Builds a Linux bridge joining the uplink and the VF representor.

    The bridge is named after the current PID and its removal is registered
    with defer(). Raises KsftSkipEx when cfg.vf_representor is not set.
    """
    br_dev = f"br_{os.getpid()}"
    ip(f"link add name {br_dev} type bridge")
    defer(cmd, f"ip link del name {br_dev} type bridge")

    ip(f"link set dev {cfg.ifname} master {br_dev}")

    vf_rep = cfg.vf_representor
    if not vf_rep:
        raise KsftSkipEx("Could not find representor for the VF")

    ip(f"link set dev {vf_rep} master {br_dev}")
    ip(f"link set dev {vf_rep} up")
    ksft_pr(f"Set representor {vf_rep} up and added to bridge")

    ip(f"link set dev {br_dev} up")
| |
| |
def setup_devlink_rate(cfg):
    """
    Applies the devlink rate limits used by the test to the VF port.

    Sets tx_max on the VF and assigns a bandwidth share to each of the
    eight traffic classes: 20 for TC3, 80 for TC4 and 0 for all others.

    Raises KsftSkipEx when the VF port index is unknown or the device
    reports EOPNOTSUPP, and KsftFailEx for any other netlink error.
    """
    vf_port = cfg.vf_port_index
    if vf_port is None:
        raise KsftSkipEx("Could not find VF port index")

    # Non-zero shares by TC index; every other TC gets 0.
    tc_shares = {3: 20, 4: 80}
    request = {
        "bus-name": "pci",
        "dev-name": cfg.pci,
        "port-index": vf_port,
        # 125,000,000 * 8 = 1e9, i.e. the 1 Gbps total the test expects
        # (assumes the attribute is expressed in bytes per second).
        "rate-tx-max": 125000000,
        "rate-tc-bws": [
            {"index": tc, "bw": tc_shares.get(tc, 0)} for tc in range(8)
        ],
    }

    try:
        cfg.devnl.rate_set(request)
    except NlError as exc:
        if exc.error == 95:  # EOPNOTSUPP
            raise KsftSkipEx("devlink rate configuration is not supported on the VF") from exc
        raise KsftFailEx(f"rate_set failed on VF port {vf_port}") from exc
| |
| |
def setup_remote_server(cfg):
    """
    Sets up VLAN interfaces and starts iperf3 servers on the remote side.

    For each test VLAN a VLAN device is created on cfg.remote_ifname, given
    an address from a /29 subnet, brought up, and a one-shot iperf3 server
    ("-s -1") bound to that address is started in the background.
    """
    remote_dev = cfg.remote_ifname
    vlan_ids = [101, 102]
    remote_ips = ["198.51.100.1", "198.51.100.9"]

    for vlan_id, ip_addr in zip(vlan_ids, remote_ips):
        vlan_dev = f"{remote_dev}.{vlan_id}"
        cmd(f"ip link add link {remote_dev} name {vlan_dev} "
            f"type vlan id {vlan_id}", host=cfg.remote)
        # Register the cleanup as soon as the device exists so it is not
        # leaked on the remote host if one of the following steps fails.
        defer(cmd, f"ip link del {vlan_dev}", host=cfg.remote)
        cmd(f"ip addr add {ip_addr}/29 dev {vlan_dev}", host=cfg.remote)
        cmd(f"ip link set dev {vlan_dev} up", host=cfg.remote)
        cmd(f"iperf3 -s -1 -B {ip_addr}", background=True, host=cfg.remote)
| |
| |
def setup_test_environment(cfg, set_tc_mapping=True):
    """
    Prepares everything a bandwidth test needs, in order: the VF (with or
    without TC mapping), its VLAN devices, VF port discovery, the bridge,
    the devlink rate configuration and the remote iperf3 servers.
    """
    vf_ifc = setup_vf(cfg, set_tc_mapping)
    ksft_pr(f"Created VF interface: {vf_ifc}")

    setup_vlans_on_vf(vf_ifc)

    # The remaining steps only need cfg and must run in this order.
    remaining_steps = (
        get_vf_info,
        setup_bridge,
        setup_devlink_rate,
        setup_remote_server,
    )
    for step in remaining_steps:
        step(cfg)

    # Presumably lets the remote servers and links settle before traffic.
    time.sleep(2)
| |
| |
def run_iperf_client(server_ip, local_ip, barrier, min_expected_gbps=0.1):
    """
    Runs a single iperf3 client instance, binding to the given local IP.
    Waits on a barrier to synchronize with other threads.

    Args:
        server_ip: Address of the iperf3 server to connect to.
        local_ip: Local address the client binds to ("-B").
        barrier: threading.Barrier shared by all client threads.
        min_expected_gbps: Results below this rate are treated as failures.

    Returns:
        The received rate in Gbps, or None if iperf3 could not be run,
        its output could not be interpreted, or the rate was too low.

    Raises:
        KsftFailEx: if the barrier wait times out or the barrier is broken.
    """
    try:
        barrier.wait(timeout=10)
    except threading.BrokenBarrierError as exc:
        raise KsftFailEx("iperf3 barrier wait timed out") from exc

    iperf_cmd = ["iperf3", "-c", server_ip, "-B", local_ip, "-J"]
    try:
        result = subprocess.run(iperf_cmd, capture_output=True, text=True,
                                check=True)
    except (OSError, subprocess.CalledProcessError) as exc:
        # This function runs in a worker thread where an escaping exception
        # would be lost; report the failure through the return value.
        ksft_pr(f"iperf3 client to {server_ip} failed: {exc}")
        return None

    try:
        output = json.loads(result.stdout)
        bits_per_second = output["end"]["sum_received"]["bits_per_second"]
    except json.JSONDecodeError as exc:
        ksft_pr(f"Failed to parse iperf3 JSON output: {exc}")
        return None
    except (KeyError, TypeError) as exc:
        ksft_pr(f"Unexpected iperf3 JSON output: {exc!r}")
        return None

    gbps = bits_per_second / 1e9
    if gbps < min_expected_gbps:
        ksft_pr(
            f"iperf3 bandwidth too low: {gbps:.2f} Gbps "
            f"(expected ≥ {min_expected_gbps} Gbps)"
        )
        return None
    return gbps
| |
| |
def run_bandwidth_test():
    """
    Launches iperf3 client threads for each VLAN/TC pair and collects results.

    Returns:
        A dict mapping TC index to the measured rate in Gbps.

    Raises:
        KsftFailEx: if any client produced no result, either because it
            returned None or because its thread ended without storing one.
    """
    def _run_iperf_client_thread(server_ip, local_ip, results, barrier, tc_ix):
        results[tc_ix] = run_iperf_client(server_ip, local_ip, barrier)

    vf_vlan_data = [
        # (local_ip, remote_ip, TC)
        ("198.51.100.2", "198.51.100.1", 3),
        ("198.51.100.10", "198.51.100.9", 4),
    ]

    results = {}
    # All clients wait on this barrier so they start sending together.
    start_barrier = threading.Barrier(len(vf_vlan_data))

    threads = [
        threading.Thread(
            target=_run_iperf_client_thread,
            args=(remote_ip, local_ip, results, start_barrier, tc_ix),
        )
        for local_ip, remote_ip, tc_ix in vf_vlan_data
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # Check every expected TC rather than only the entries present: a
    # thread that died with an exception never stores a result at all.
    for _local_ip, _remote_ip, tc_ix in vf_vlan_data:
        if results.get(tc_ix) is None:
            raise KsftFailEx("iperf3 client failed; cannot evaluate bandwidth")

    return results
| |
def calculate_bandwidth_percentages(results):
    """
    Calculates the percentage of total bandwidth received by TC3 and TC4.

    Args:
        results: Dict mapping TC index (3 and 4) to measured bandwidth.

    Returns:
        A dict with the per-TC bandwidths ('tc3_bw', 'tc4_bw'), their
        shares of the total in percent ('tc3_percentage',
        'tc4_percentage') and the sum ('total_bw').

    Raises:
        KsftFailEx: if a TC result is missing or the total is not positive.
    """
    if 3 not in results or 4 not in results:
        raise KsftFailEx(f"Missing expected TC results in {results}")

    tc3_bw = results[3]
    tc4_bw = results[4]
    total_bw = tc3_bw + tc4_bw
    if total_bw <= 0:
        # Shares are undefined without traffic; fail instead of dividing by 0.
        raise KsftFailEx(f"No bandwidth measured in {results}")

    tc3_percentage = (tc3_bw / total_bw) * 100
    tc4_percentage = (tc4_bw / total_bw) * 100

    return {
        'tc3_bw': tc3_bw,
        'tc4_bw': tc4_bw,
        'tc3_percentage': tc3_percentage,
        'tc4_percentage': tc4_percentage,
        'total_bw': total_bw
    }
| |
| |
def print_bandwidth_results(bw_data, test_name):
    """
    Logs the per-TC rates, the total rate and the per-TC shares held in
    bw_data, under a heading that includes test_name.
    """
    ksft_pr(f"Bandwidth check results {test_name}:")

    # (format template, bw_data key), printed in this order.
    rows = (
        ("TC 3: {:.2f} Gbits/sec", 'tc3_bw'),
        ("TC 4: {:.2f} Gbits/sec", 'tc4_bw'),
        ("Total bandwidth: {:.2f} Gbits/sec", 'total_bw'),
        ("TC 3 percentage: {:.1f}%", 'tc3_percentage'),
        ("TC 4 percentage: {:.1f}%", 'tc4_percentage'),
    )
    for template, key in rows:
        ksft_pr(template.format(bw_data[key]))
| |
| |
def verify_total_bandwidth(bw_data, validator):
    """
    Checks that the measured total bandwidth is within tolerance of the
    expected total.

    Returns None when it is. Raises KsftSkipEx when the total is below the
    minimum, and KsftFailEx when it is out of bounds for any other reason.
    """
    total = bw_data['total_bw']
    target = validator.expected_total_gbps

    if not validator.bound(target, total):
        floor = validator.total_min_expected
        if total < floor:
            # Too little traffic to judge the split: skip rather than fail.
            raise KsftSkipEx(
                f"Total bandwidth {total:.2f} Gbps < minimum "
                f"{floor:.2f} Gbps; "
                f"parent tx_max ({target:.1f} G) "
                f"not reached, cannot validate share"
            )

        ceiling = validator.total_max_expected
        raise KsftFailEx(
            f"Total bandwidth {total:.2f} Gbps exceeds allowed ceiling "
            f"{ceiling:.2f} Gbps "
            f"(VF tx_max set to {target:.1f} G)"
        )
| |
| |
def check_bandwidth_distribution(bw_data, validator):
    """
    Tells whether the measured TC3 and TC4 shares are both acceptable.

    Each percentage in bw_data is passed to the validator together with
    its TC index; both checks are always evaluated.

    Returns:
        bool: True if both TC3 and TC4 percentages are within bounds.
    """
    verdicts = [
        validator.tc_bandwidth_bound(bw_data[f'tc{tc_ix}_percentage'], tc_ix)
        for tc_ix in (3, 4)
    ]
    return verdicts[0] and verdicts[1]
| |
| |
def run_bandwidth_distribution_test(cfg, set_tc_mapping):
    """
    Sets up the environment, measures both TCs in parallel, logs the
    results and validates the total bandwidth.

    Returns the outcome of the per-TC share check (True when both shares
    are within bounds).
    """
    setup_test_environment(cfg, set_tc_mapping)

    bw_data = calculate_bandwidth_percentages(run_bandwidth_test())

    if set_tc_mapping:
        label = "with TC mapping"
    else:
        label = "without TC mapping"
    print_bandwidth_results(bw_data, label)

    validator = cfg.bw_validator
    verify_total_bandwidth(bw_data, validator)

    return check_bandwidth_distribution(bw_data, validator)
| |
| |
def test_no_tc_mapping_bandwidth(cfg):
    """
    Verifies that bandwidth is not split 80/20 without traffic class mapping.

    The expectation is inverted for devices whose "ethtool -i" output
    reports the mlx5 driver: there a matching split is an expected failure
    (xfail), and a non-matching split is reported as a failure because it
    means the driver behavior has changed.
    """
    pass_bw_msg = "Bandwidth is NOT distributed as 80/20 without TC mapping"
    fail_bw_msg = "Bandwidth matched 80/20 split without TC mapping"
    is_mlx5 = "driver: mlx5" in ethtool(f"-i {cfg.ifname}").stdout

    if run_bandwidth_distribution_test(cfg, set_tc_mapping=False):
        if is_mlx5:
            raise KsftXfailEx(fail_bw_msg)
        raise KsftFailEx(fail_bw_msg)
    if is_mlx5:
        raise KsftFailEx(f"mlx5 behavior changed: {pass_bw_msg}")
    ksft_pr(pass_bw_msg)
| |
| |
def test_tc_mapping_bandwidth(cfg):
    """
    Verifies that, with traffic class mapping configured, the measured
    TC3/TC4 shares match the configured split. Raises KsftFailEx otherwise.
    """
    split_ok = run_bandwidth_distribution_test(cfg, set_tc_mapping=True)
    if not split_ok:
        raise KsftFailEx("Bandwidth did not match 80/20 split with TC mapping")
    ksft_pr("Bandwidth is distributed as 80/20 with TC mapping")
| |
| |
def main() -> None:
    """
    Entry point: prepares the shared test configuration and runs both
    test cases.
    """
    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
        cfg.devnl = DevlinkFamily()

        # The last component of the resolved "device" link is used as the
        # PCI address of the interface.
        device_link = os.path.realpath(f"/sys/class/net/{cfg.ifname}/device")
        cfg.pci = os.path.basename(device_link)
        if not cfg.pci:
            raise KsftSkipEx("Could not get PCI address of the interface")
        cfg.require_cmd("iperf3", local=True, remote=True)

        cfg.bw_validator = BandwidthValidator()

        ksft_run(
            cases=[test_no_tc_mapping_bandwidth, test_tc_mapping_bandwidth],
            args=(cfg,),
        )
    ksft_exit()
| |
| |
# Run the test suite only when this file is executed as a script.
if __name__ == "__main__":
    main()