We’re trying to enable RDMA on vSAN 8 (vCenter 8.0.1, build 21860503; VMware ESXi 8.0.1, build 21813344) with our ConnectX-5 NICs, but we’re running into challenges getting everything configured.
No matter what we configure, we can’t get our NIC to show “Mode: 3 - IEEE mode”; it remains at “Mode: 0 - Unknown”.
[root@1:~] esxcli network nic dcb status get -n vmnic0
   Nic Name: vmnic0
   Mode: 0 - Unknown
   Enabled: true
   Capabilities:
         Priority Group: true
         Priority Flow Control: true
         PG Traffic Classes: 8
         PFC Traffic Classes: 8
   PFC Enabled: true
   PFC Configuration: 0 0 0 1 0 0 0 0
   IEEE ETS Configuration:
         Willing Bit In ETS Config TLV: 1
         Supported Capacity: 8
         Credit Based Shaper ETS Algorithm Supported: 0x0
         TX Bandwidth Per TC: 13 13 13 13 12 12 12 12
         RX Bandwidth Per TC: 13 13 13 13 12 12 12 12
         TSA Assignment Table Per TC: 2 2 2 2 2 2 2 2
         Priority Assignment Per TC: 1 0 2 3 4 5 6 7
         Recommended TC Bandwidth Per TC: 13 13 13 13 12 12 12 12
         Recommended TSA Assignment Per TC: 2 2 2 2 2 2 2 2
         Recommended Priority Assignment Per TC: 1 0 2 3 4 5 6 7
   IEEE PFC Configuration:
         Number Of Traffic Classes: 8
         PFC Configuration: 0 0 0 0 0 0 0 0
         Macsec Bypass Capability Is Enabled: 0
         Round Trip Propagation Delay Of Link: 0
         Sent PFC Frames: 0 0 0 0 0 0 0 0
         Received PFC Frames: 0 0 0 0 0 0 0 0
   DCB Apps:
I’ve configured the NIC and ESXi like this:
/opt/mellanox/bin/mlxconfig -d mt4119_pciconf0 set DCBX_IEEE_P1=1 DCBX_CEE_P1=0 DCBX_IEEE_P2=1 DCBX_CEE_P2=0 LLDP_NB_DCBX_P1=1 LLDP_NB_DCBX_P2=1
esxcli system module parameters set -m nmlx5_core -p dcbx=1
esxcli system module parameters set -m nmlx5_core -p "pfctx=0x08 pfcrx=0x08 trust_state=2 max_vfs=4"
esxcli system module parameters set -m nmlx5_rdma -p "pcp_force=3 dscp_force=26"
[root@1:~] /opt/mellanox/bin/mlxconfig -d mt4119_pciconf0 query Device #1: ---------- Device type: ConnectX5 Name: 879482-B21_Ax Description: HPE InfiniBand FDR/Ethernet 40/50Gb 2-port 547FLR-QSFP Adapter Device: mt4119_pciconf0 Configurations: Next Boot MEMIC_BAR_SIZE 0 MEMIC_SIZE_LIMIT _256KB(1) HOST_CHAINING_MODE DISABLED(0) HOST_CHAINING_CACHE_DISABLE False(0) HOST_CHAINING_DESCRIPTORS Array[0..7] HOST_CHAINING_TOTAL_BUFFER_SIZE Array[0..7] FLEX_PARSER_PROFILE_ENABLE 0 FLEX_IPV4_OVER_VXLAN_PORT 0 ROCE_NEXT_PROTOCOL 254 ESWITCH_HAIRPIN_DESCRIPTORS Array[0..7] ESWITCH_HAIRPIN_TOT_BUFFER_SIZE Array[0..7] PF_BAR2_SIZE 0 PF_NUM_OF_VF_VALID False(0) NON_PREFETCHABLE_PF_BAR False(0) VF_VPD_ENABLE False(0) PF_NUM_PF_MSIX_VALID False(0) PER_PF_NUM_SF False(0) STRICT_VF_MSIX_NUM False(0) VF_NODNIC_ENABLE False(0) NUM_PF_MSIX_VALID True(1) NUM_OF_VFS 8 NUM_OF_PF 2 PF_BAR2_ENABLE False(0) SRIOV_EN True(1) PF_LOG_BAR_SIZE 5 VF_LOG_BAR_SIZE 0 NUM_PF_MSIX 63 NUM_VF_MSIX 11 INT_LOG_MAX_PAYLOAD_SIZE AUTOMATIC(0) PCIE_CREDIT_TOKEN_TIMEOUT 0 ACCURATE_TX_SCHEDULER False(0) PARTIAL_RESET_EN False(0) SW_RECOVERY_ON_ERRORS False(0) RESET_WITH_HOST_ON_ERRORS False(0) ADVANCED_POWER_SETTINGS False(0) CQE_COMPRESSION BALANCED(0) IP_OVER_VXLAN_EN False(0) MKEY_BY_NAME False(0) ESWITCH_IPV4_TTL_MODIFY_ENABLE False(0) PRIO_TAG_REQUIRED_EN False(0) UCTX_EN True(1) PCI_ATOMIC_MODE PCI_ATOMIC_DISABLED_EXT_ATOMIC_ENABLED(0) TUNNEL_ECN_COPY_DISABLE False(0) LRO_LOG_TIMEOUT0 6 LRO_LOG_TIMEOUT1 7 LRO_LOG_TIMEOUT2 8 LRO_LOG_TIMEOUT3 13 LOG_TX_PSN_WINDOW 7 LOG_MAX_OUTSTANDING_WQE 7 ROCE_ADAPTIVE_ROUTING_EN False(0) TUNNEL_IP_PROTO_ENTROPY_DISABLE False(0) ICM_CACHE_MODE DEVICE_DEFAULT(0) TX_SCHEDULER_BURST 0 ZERO_TOUCH_TUNING_ENABLE False(0) LOG_MAX_QUEUE 17 LOG_DCR_HASH_TABLE_SIZE 11 MAX_PACKET_LIFETIME 0 DCR_LIFO_SIZE 16384 LINK_TYPE_P1 ETH(2) LINK_TYPE_P2 ETH(2) ROCE_CC_PRIO_MASK_P1 255 ROCE_CC_PRIO_MASK_P2 255 CLAMP_TGT_RATE_AFTER_TIME_INC_P1 True(1) CLAMP_TGT_RATE_P1 False(0) RPG_TIME_RESET_P1 
300 RPG_BYTE_RESET_P1 32767 RPG_THRESHOLD_P1 1 RPG_MAX_RATE_P1 0 RPG_AI_RATE_P1 5 RPG_HAI_RATE_P1 50 RPG_GD_P1 11 RPG_MIN_DEC_FAC_P1 50 RPG_MIN_RATE_P1 1 RATE_TO_SET_ON_FIRST_CNP_P1 0 DCE_TCP_G_P1 1019 DCE_TCP_RTT_P1 1 RATE_REDUCE_MONITOR_PERIOD_P1 4 INITIAL_ALPHA_VALUE_P1 1023 MIN_TIME_BETWEEN_CNPS_P1 4 CNP_802P_PRIO_P1 6 CNP_DSCP_P1 48 CLAMP_TGT_RATE_AFTER_TIME_INC_P2 True(1) CLAMP_TGT_RATE_P2 False(0) RPG_TIME_RESET_P2 300 RPG_BYTE_RESET_P2 32767 RPG_THRESHOLD_P2 1 RPG_MAX_RATE_P2 0 RPG_AI_RATE_P2 5 RPG_HAI_RATE_P2 50 RPG_GD_P2 11 RPG_MIN_DEC_FAC_P2 50 RPG_MIN_RATE_P2 1 RATE_TO_SET_ON_FIRST_CNP_P2 0 DCE_TCP_G_P2 1019 DCE_TCP_RTT_P2 1 RATE_REDUCE_MONITOR_PERIOD_P2 4 INITIAL_ALPHA_VALUE_P2 1023 MIN_TIME_BETWEEN_CNPS_P2 4 CNP_802P_PRIO_P2 6 CNP_DSCP_P2 48 LLDP_NB_DCBX_P1 True(1) LLDP_NB_RX_MODE_P1 ALL(2) LLDP_NB_TX_MODE_P1 ALL(2) LLDP_NB_DCBX_P2 True(1) LLDP_NB_RX_MODE_P2 ALL(2) LLDP_NB_TX_MODE_P2 ALL(2) ROCE_RTT_RESP_DSCP_P1 0 ROCE_RTT_RESP_DSCP_MODE_P1 DEVICE_DEFAULT(0) ROCE_RTT_RESP_DSCP_P2 0 ROCE_RTT_RESP_DSCP_MODE_P2 DEVICE_DEFAULT(0) DCBX_IEEE_P1 True(1) DCBX_CEE_P1 False(0) DCBX_WILLING_P1 True(1) DCBX_IEEE_P2 True(1) DCBX_CEE_P2 False(0) DCBX_WILLING_P2 True(1) KEEP_ETH_LINK_UP_P1 True(1) KEEP_IB_LINK_UP_P1 False(0) KEEP_LINK_UP_ON_BOOT_P1 False(0) KEEP_LINK_UP_ON_STANDBY_P1 False(0) DO_NOT_CLEAR_PORT_STATS_P1 False(0) AUTO_POWER_SAVE_LINK_DOWN_P1 False(0) KEEP_ETH_LINK_UP_P2 True(1) KEEP_IB_LINK_UP_P2 False(0) KEEP_LINK_UP_ON_BOOT_P2 False(0) KEEP_LINK_UP_ON_STANDBY_P2 False(0) DO_NOT_CLEAR_PORT_STATS_P2 False(0) AUTO_POWER_SAVE_LINK_DOWN_P2 False(0) NUM_OF_VL_P1 _4_VLs(3) NUM_OF_TC_P1 _8_TCs(0) NUM_OF_PFC_P1 8 VL15_BUFFER_SIZE_P1 0 QOS_TRUST_STATE_P1 TRUST_PCP(1) NUM_OF_VL_P2 _4_VLs(3) NUM_OF_TC_P2 _8_TCs(0) NUM_OF_PFC_P2 8 VL15_BUFFER_SIZE_P2 0 QOS_TRUST_STATE_P2 TRUST_PCP(1) DUP_MAC_ACTION_P1 LAST_CFG(0) MPFS_MC_LOOPBACK_DISABLE_P1 False(0) MPFS_UC_LOOPBACK_DISABLE_P1 False(0) UNKNOWN_UPLINK_MAC_FLOOD_P1 False(0) SRIOV_IB_ROUTING_MODE_P1 LID(1) 
IB_ROUTING_MODE_P1 LID(1) DUP_MAC_ACTION_P2 LAST_CFG(0) MPFS_MC_LOOPBACK_DISABLE_P2 False(0) MPFS_UC_LOOPBACK_DISABLE_P2 False(0) UNKNOWN_UPLINK_MAC_FLOOD_P2 False(0) SRIOV_IB_ROUTING_MODE_P2 LID(1) IB_ROUTING_MODE_P2 LID(1) PHY_AUTO_NEG_P1 DEVICE_DEFAULT(0) PHY_RATE_MASK_OVERRIDE_P1 False(0) PHY_FEC_OVERRIDE_P1 DEVICE_DEFAULT(0) PHY_AUTO_NEG_P2 DEVICE_DEFAULT(0) PHY_RATE_MASK_OVERRIDE_P2 False(0) PHY_FEC_OVERRIDE_P2 DEVICE_DEFAULT(0) PF_TOTAL_SF 0 PF_SF_BAR_SIZE 0 PF_NUM_PF_MSIX 63 ROCE_CONTROL ROCE_ENABLE(2) PCI_WR_ORDERING per_mkey(0) MULTI_PORT_VHCA_EN False(0) PORT_OWNER True(1) ALLOW_RD_COUNTERS True(1) RENEG_ON_CHANGE True(1) TRACER_ENABLE True(1) IP_VER IPv4(0) BOOT_UNDI_NETWORK_WAIT 0 UEFI_HII_EN True(1) BOOT_DBG_LOG False(0) UEFI_LOGS DISABLED(0) BOOT_VLAN 1 LEGACY_BOOT_PROTOCOL PXE(1) BOOT_INTERRUPT_DIS False(0) BOOT_LACP_DIS True(1) BOOT_VLAN_EN False(0) BOOT_PKEY 0 P2P_ORDERING_MODE DEVICE_DEFAULT(0) ATS_ENABLED False(0) DYNAMIC_VF_MSIX_TABLE False(0) EXP_ROM_UEFI_x86_ENABLE True(1) EXP_ROM_PXE_ENABLE True(1) ADVANCED_PCI_SETTINGS False(0) SAFE_MODE_THRESHOLD 10 SAFE_MODE_ENABLE True(1)
2023-10-02T21:50:02.178Z Wa(180) vmkwarning: cpu14:2098420)WARNING: rdmaDriver: RDMAFindTeamDeviceByPortID:3138: Unspported team policy = 8 status = Success
2023-10-02T21:50:02.178Z Wa(180) vmkwarning: cpu14:2098420)WARNING: rdmaDriver: RDMACM_BindLegacy:4297: The provided interface (<redacted>) does not have a registered rdma device.
2023-10-02T21:50:02.178Z In(182) vmkernel: cpu14:2098420)RDT: RDTCreateRDMAServer:2754: vmk_RDMACMBind() failed for server Bad parameter
2023-10-02T21:50:02.178Z In(182) vmkernel: cpu14:2098420)RDT: RDTCreateRDMAServer:2787: RDTCreateRDMAServer() exiting with failure
2023-10-02T21:50:02.178Z In(182) vmkernel: cpu14:2098420)RDT: RDTEnableRdmaInt:642: Failed to create listener for address <redacted>, protocol 2, status Bad parameter
2023-10-02T21:50:07.181Z In(182) vmkernel: cpu14:2098420)RDT: RDTDisableRdmaInt:722: SupportedTransportProtocolsMask removes RDMA
2023-10-02T21:50:07.181Z Wa(180) vmkwarning: cpu14:2098420)WARNING: rdmaDriver: RDMAFindTeamDeviceByPortID:3138: Unspported team policy = 8 status = Success
2023-10-02T21:50:07.181Z Wa(180) vmkwarning: cpu14:2098420)WARNING: rdmaDriver: RDMACM_BindLegacy:4297: The provided interface (<redacted>) does not have a registered rdma device.
2023-10-02T21:50:07.181Z In(182) vmkernel: cpu14:2098420)RDT: RDTCreateRDMAServer:2754: vmk_RDMACMBind() failed for server Bad parameter
2023-10-02T21:50:07.181Z In(182) vmkernel: cpu14:2098420)RDT: RDTCreateRDMAServer:2787: RDTCreateRDMAServer() exiting with failure
2023-10-02T21:50:07.181Z In(182) vmkernel: cpu14:2098420)RDT: RDTEnableRdmaInt:642: Failed to create listener for address <redacted>, protocol 2, status Bad parameter
We followed this guide for the switches:
https://enterprise-support.nvidia.com/s/article/qos-configuration-examples-for-cisco-nexus-5600
We also gathered the information from:
https://www.reddit.com/r/vmware/comments/ozhq6j/vsan_rdma_with_mellanox_nic/
Any idea what we may be missing or what else we can try? Everything looks like we should be able to use RDMA, yet here we are.