From a31c5dca12d81f81f3aaf9629e8c6ea0660fd06d Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 30 Oct 2025 06:59:47 -0700 Subject: [PATCH] tensile fedora gpus --- Tensile/AsmCaps.py | 176 ++++++++++++++++++ Tensile/Common.py | 8 +- Tensile/Source/lib/include/Tensile/AMDGPU.hpp | 25 ++- .../include/Tensile/PlaceholderLibrary.hpp | 17 +- 4 files changed, 216 insertions(+), 10 deletions(-) diff --git a/Tensile/AsmCaps.py b/Tensile/AsmCaps.py index cacc1848b7e0..41330270c618 100644 --- a/Tensile/AsmCaps.py +++ b/Tensile/AsmCaps.py @@ -683,6 +683,50 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict: 'VOP3v_dot4_i32_i8': False, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, + (10, 3, 5): {'HasAddLshl': True, + 'HasAtomicAdd': False, + 'HasDirectToLdsDest': False, + 'HasDirectToLdsNoDest': True, + 'HasExplicitCO': True, + 'HasExplicitNC': True, + 'HasGLCModifier': True, + 'HasNTModifier': False, + 'HasLshlOr': True, + 'HasMFMA': False, + 'HasMFMA_b8': False, + 'HasMFMA_bf16_1k': False, + 'HasMFMA_bf16_original': False, + 'HasMFMA_constSrc': False, + 'HasMFMA_f64': False, + 'HasMFMA_f8': False, + 'HasMFMA_i8_908': False, + 'HasMFMA_i8_940': False, + 'HasMFMA_vgpr': False, + 'HasMFMA_xf32': False, + 'HasSMulHi': True, + 'HasWMMA': False, + 'KernargPreloading': False, + 'MaxLgkmcnt': 15, + 'MaxVmcnt': 63, + 'SupportedISA': True, + 'SupportedSource': True, + 'VOP3v_dot4_i32_i8': True, + 'v_dot2_f32_f16': True, + 'v_dot2c_f32_f16': True, + 'v_dot4_i32_i8': False, + 'v_dot4c_i32_i8': True, + 'v_fma_f16': True, + 'v_fma_f32': True, + 'v_fma_f64': True, + 'v_fma_mix_f32': True, + 'v_fmac_f16': False, + 'v_fmac_f32': True, + 'v_mac_f16': False, + 'v_mac_f32': False, + 'v_mad_mix_f32': False, + 'v_mov_b64': False, + 'v_pk_fma_f16': True, + 'v_pk_fmac_f16': False}, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'v_fma_f16': True, @@ -859,6 +903,94 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict: 'VOP3v_dot4_i32_i8': False, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, + (11, 0, 3): {'HasAddLshl': True, + 'HasAtomicAdd': True, + 'HasDirectToLdsDest': False, + 'HasDirectToLdsNoDest': False, + 'HasExplicitCO': True, + 'HasExplicitNC': True, + 'HasGLCModifier': True, + 'HasNTModifier': False, + 'HasLshlOr': True, + 'HasMFMA': False, + 'HasMFMA_b8': False, + 'HasMFMA_bf16_1k': False, + 'HasMFMA_bf16_original': False, + 'HasMFMA_constSrc': False, + 'HasMFMA_f64': False, + 'HasMFMA_f8': False, + 'HasMFMA_i8_908': False, + 'HasMFMA_i8_940': False, + 'HasMFMA_vgpr': False, + 'HasMFMA_xf32': False, + 'HasSMulHi': True, + 'HasWMMA': True, + 'KernargPreloading': False, + 'MaxLgkmcnt': 15, + 'MaxVmcnt': 63, + 'SupportedISA': True, + 'SupportedSource': True, + 'VOP3v_dot4_i32_i8': False, + 'v_dot2_f32_f16': True, + 'v_dot2c_f32_f16': True, + 'v_dot4_i32_i8': False, + 'v_dot4c_i32_i8': False, + 'v_fma_f16': True, + 'v_fma_f32': True, + 'v_fma_f64': True, + 'v_fma_mix_f32': True, + 'v_fmac_f16': False, + 'v_fmac_f32': True, + 'v_mac_f16': False, + 'v_mac_f32': False, + 'v_mad_mix_f32': False, + 'v_mov_b64': False, + 'v_pk_fma_f16': True, + 'v_pk_fmac_f16': False}, + (11, 5, 0): {'HasAddLshl': True, + 'HasAtomicAdd': True, + 'HasDirectToLdsDest': False, + 'HasDirectToLdsNoDest': False, + 'HasExplicitCO': True, + 'HasExplicitNC': True, + 'HasGLCModifier': True, + 'HasNTModifier': False, + 'HasLshlOr': True, + 'HasMFMA': False, + 'HasMFMA_b8': False, + 'HasMFMA_bf16_1k': False, + 'HasMFMA_bf16_original': False, + 'HasMFMA_constSrc': False, + 'HasMFMA_f64': False, + 'HasMFMA_f8': False, + 'HasMFMA_i8_908': False, + 'HasMFMA_i8_940': False, + 'HasMFMA_vgpr': False, + 'HasMFMA_xf32': False, + 'HasSMulHi': True, + 'HasWMMA': True, + 'KernargPreloading': False, + 'MaxLgkmcnt': 15, + 'MaxVmcnt': 63, + 'SupportedISA': True, + 'SupportedSource': True, + 'VOP3v_dot4_i32_i8': False, + 'v_dot2_f32_f16': True, + 'v_dot2c_f32_f16': True, + 'v_dot4_i32_i8': False, + 'v_dot4c_i32_i8': False, + 'v_fma_f16': True, + 'v_fma_f32': True, + 'v_fma_f64': True, + 'v_fma_mix_f32': True, + 'v_fmac_f16': False, + 'v_fmac_f32': True, + 'v_mac_f16': False, + 'v_mac_f32': False, + 'v_mad_mix_f32': False, + 'v_mov_b64': False, + 'v_pk_fma_f16': True, + 'v_pk_fmac_f16': False}, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'v_fma_f16': True, @@ -947,6 +1079,50 @@ def getCapabilitiesCache(rocmVersion: NamedTuple) -> dict: 'VOP3v_dot4_i32_i8': False, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': False, + (11, 5, 2): {'HasAddLshl': True, + 'HasAtomicAdd': True, + 'HasDirectToLdsDest': False, + 'HasDirectToLdsNoDest': False, + 'HasExplicitCO': True, + 'HasExplicitNC': True, + 'HasGLCModifier': True, + 'HasNTModifier': False, + 'HasLshlOr': True, + 'HasMFMA': False, + 'HasMFMA_b8': False, + 'HasMFMA_bf16_1k': False, + 'HasMFMA_bf16_original': False, + 'HasMFMA_constSrc': False, + 'HasMFMA_f64': False, + 'HasMFMA_f8': False, + 'HasMFMA_i8_908': False, + 'HasMFMA_i8_940': False, + 'HasMFMA_vgpr': False, + 'HasMFMA_xf32': False, + 'HasSMulHi': True, + 'HasWMMA': True, + 'KernargPreloading': False, + 'MaxLgkmcnt': 15, + 'MaxVmcnt': 63, + 'SupportedISA': True, + 'SupportedSource': True, + 'VOP3v_dot4_i32_i8': False, + 'v_dot2_f32_f16': True, + 'v_dot2c_f32_f16': True, + 'v_dot4_i32_i8': False, + 'v_dot4c_i32_i8': False, + 'v_fma_f16': True, + 'v_fma_f32': True, + 'v_fma_f64': True, + 'v_fma_mix_f32': True, + 'v_fmac_f16': False, + 'v_fmac_f32': True, + 'v_mac_f16': False, + 'v_mac_f32': False, + 'v_mad_mix_f32': False, + 'v_mov_b64': False, + 'v_pk_fma_f16': True, + 'v_pk_fmac_f16': False}, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'v_fma_f16': True, diff --git a/Tensile/Common.py b/Tensile/Common.py index 9370c3ef09d4..8b6b43111877 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -248,9 +248,9 @@ globalParameters["MaxFileName"] = 64 # If a file name would be long globalParameters["SupportedISA"] = [(8,0,3), (9,0,0), (9,0,6), (9,0,8), (9,0,10), (9,4,2), (9,5,0), - (10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1), (10,3,2), + (10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1), (10,3,2), (10,3,5), (11,0,0), (11,0,1), (11,0,2), (11,0,3), - (11,5,0), (11,5,1), + (11,5,0), (11,5,1), (11,5,2), (12,0,0), (12,0,1)] # assembly kernels writer supports these architectures globalParameters["KeepBuildTmp"] = True # Do not remove build artifacts during the build process or build_tmp after build completes @@ -326,7 +326,7 @@ architectureMap = { 'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14', 'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1034':'navi24', 'gfx1035':'rembrandt', 'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', 'gfx1103':'gfx1103', - 'gfx1150':'strixpoint', 'gfx1151':'strixhalo', + 'gfx1150':'strixpoint', 'gfx1151':'strixhalo','gfx1152':'gfx1152', 'gfx1200':'gfx1200', 'gfx1201':'gfx1201' } @@ -2466,7 +2466,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ): if os.name == "nt": globalParameters["CurrentISA"] = (9,0,6) printWarning("Failed to detect ISA so forcing (gfx906) on windows") - isasWithDisabledHWMonitor = ((9,4,2), (9,5,0), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (12,0,0), (12,0,1)) + isasWithDisabledHWMonitor = ((9,4,2), (9,5,0), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (12,0,0), (12,0,1)) if globalParameters["CurrentISA"] in isasWithDisabledHWMonitor: isaString = ', '.join(map(gfxName, isasWithDisabledHWMonitor)) printWarning(f"HardwareMonitor currently disabled for {isaString}") diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp index 317250db16c4..1dc9b82d8c3a 100644 --- a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp +++ b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp @@ -76,6 +76,7 @@ namespace Tensile gfx1103 = 1103, gfx1150 = 1150, gfx1151 = 1151, + gfx1152 = 1152, gfx1200 = 1200, gfx1201 = 1201 }; @@ -126,13 +127,15 @@ namespace Tensile return "gfx1150"; case AMDGPU::Processor::gfx1151: return "gfx1151"; + case AMDGPU::Processor::gfx1152: + return "gfx1152"; case AMDGPU::Processor::gfx1200: return "gfx1200"; case AMDGPU::Processor::gfx1201: return "gfx1201"; - } - return ""; - } + } + return ""; + } AMDGPU::Processor toProcessorId(std::string const& deviceString) { @@ -212,6 +215,22 @@ namespace Tensile { return AMDGPU::Processor::gfx1201; } + else if(deviceString.find("gfx1103") != std::string::npos) + { + return AMDGPU::Processor::gfx1103; + } + else if(deviceString.find("gfx1150") != std::string::npos) + { + return AMDGPU::Processor::gfx1150; + } + else if(deviceString.find("gfx1151") != std::string::npos) + { + return AMDGPU::Processor::gfx1151; + } + else if(deviceString.find("gfx1152") != std::string::npos) + { + return AMDGPU::Processor::gfx1152; + } else { return static_cast(0); diff --git a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp index a21e584d291a..cb1c085258c9 100644 --- a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp +++ b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp @@ -57,6 +57,10 @@ namespace Tensile gfx1100, gfx1101, gfx1102, + gfx1103, + gfx1151, + gfx1150, + gfx1152, gfx1103, gfx1150, gfx1151, @@ -118,10 +122,17 @@ namespace Tensile return "TensileLibrary_*_gfx1200"; case LazyLoadingInit::gfx1201: return "TensileLibrary_*_gfx1201"; - case LazyLoadingInit::None: - return ""; + case LazyLoadingInit::gfx1103: + return "TensileLibrary_*_gfx1103"; + case LazyLoadingInit::gfx1150: + return "TensileLibrary_*_gfx1150"; + case LazyLoadingInit::gfx1151: + return "TensileLibrary_*_gfx1151"; + case LazyLoadingInit::gfx1152: + return "TensileLibrary_*_gfx1152"; + case LazyLoadingInit::None: + return ""; } - return ""; } template -- 2.52.0